diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,15217 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.999111111111111,
+  "eval_steps": 500,
+  "global_step": 562,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.984375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 958.0,
+      "completions/mean_length": 1020.078125,
+      "completions/mean_terminated_length": 773.0,
+      "completions/min_length": 588.0,
+      "completions/min_terminated_length": 588.0,
+      "epoch": 0.0035555555555555557,
+      "grad_norm": 0.2771470571706861,
+      "kl": 0.0,
+      "learning_rate": 0.0,
+      "loss": 0.0,
+      "num_tokens": 149594.0,
+      "reward": 0.0078125,
+      "reward_std": 0.009021097794175148,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.0071111111111111115,
+      "grad_norm": 0.0,
+      "kl": 0.0,
+      "learning_rate": 2.941176470588235e-08,
+      "loss": 0.0,
+      "num_tokens": 299738.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.984375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 660.0,
+      "completions/mean_length": 1014.9609375,
+      "completions/mean_terminated_length": 445.5,
+      "completions/min_length": 231.0,
+      "completions/min_terminated_length": 231.0,
+      "epoch": 0.010666666666666666,
+      "grad_norm": 0.3910424013646084,
+      "kl": 0.0012340545654296875,
+      "learning_rate": 5.88235294117647e-08,
+      "loss": 0.0,
+      "num_tokens": 448737.0,
+      "reward": 0.01171875,
+      "reward_std": 0.016833597794175148,
+      "rewards/equation_reward_func/mean": 0.0234375,
+      "rewards/equation_reward_func/std": 0.15188287198543549,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.984375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 564.0,
+      "completions/mean_length": 1015.1328125,
+      "completions/mean_terminated_length": 456.5,
+      "completions/min_length": 349.0,
+      "completions/min_terminated_length": 349.0,
+      "epoch": 0.014222222222222223,
+      "grad_norm": 0.7477560337297564,
+      "kl": 0.0012416839599609375,
+      "learning_rate": 8.823529411764706e-08,
+      "loss": 0.0,
+      "num_tokens": 597734.0,
+      "reward": 0.0078125,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.017777777777777778,
+      "grad_norm": 0.004210061181011468,
+      "kl": 0.0011844635009765625,
+      "learning_rate": 1.176470588235294e-07,
+      "loss": 0.0,
+      "num_tokens": 747806.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.021333333333333333,
+      "grad_norm": 0.30463059980407714,
+      "kl": 0.0012011528015136719,
+      "learning_rate": 1.4705882352941175e-07,
+      "loss": 0.0,
+      "num_tokens": 897894.0,
+      "reward": 0.00390625,
+      "reward_std": 0.0078125,
+      "rewards/equation_reward_func/mean": 0.0078125,
+      "rewards/equation_reward_func/std": 0.0883883461356163,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.024888888888888887,
+      "grad_norm": 0.9478569692121466,
+      "kl": 0.0013608932495117188,
+      "learning_rate": 1.764705882352941e-07,
+      "loss": 0.0,
+      "num_tokens": 1048030.0,
+      "reward": 0.00390625,
+      "reward_std": 0.0078125,
+      "rewards/equation_reward_func/mean": 0.0078125,
+      "rewards/equation_reward_func/std": 0.0883883461356163,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 936.0,
+      "completions/mean_length": 1023.3125,
+      "completions/mean_terminated_length": 936.0,
+      "completions/min_length": 936.0,
+      "completions/min_terminated_length": 936.0,
+      "epoch": 0.028444444444444446,
+      "grad_norm": 0.0033097224749988386,
+      "kl": 0.0013580322265625,
+      "learning_rate": 2.0588235294117645e-07,
+      "loss": 0.0,
+      "num_tokens": 1198082.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.032,
+      "grad_norm": 0.3396430847864516,
+      "kl": 0.0011739730834960938,
+      "learning_rate": 2.352941176470588e-07,
+      "loss": 0.0,
+      "num_tokens": 1348102.0,
+      "reward": 0.0078125,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.035555555555555556,
+      "grad_norm": 0.0019085271667908846,
+      "kl": 0.0013241767883300781,
+      "learning_rate": 2.6470588235294114e-07,
+      "loss": 0.0,
+      "num_tokens": 1498154.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.03911111111111111,
+      "grad_norm": 0.12196802997866993,
+      "kl": 0.003757476806640625,
+      "learning_rate": 2.941176470588235e-07,
+      "loss": 0.0,
+      "num_tokens": 1648294.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 779.0,
+      "completions/mean_length": 1022.0859375,
+      "completions/mean_terminated_length": 779.0,
+      "completions/min_length": 779.0,
+      "completions/min_terminated_length": 779.0,
+      "epoch": 0.042666666666666665,
+      "grad_norm": 0.353412956717465,
+      "kl": 0.0011844635009765625,
+      "learning_rate": 3.2352941176470586e-07,
+      "loss": 0.0,
+      "num_tokens": 1798157.0,
+      "reward": 0.0078125,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9765625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 862.0,
+      "completions/mean_length": 1015.5,
+      "completions/mean_terminated_length": 661.3333740234375,
+      "completions/min_length": 286.0,
+      "completions/min_terminated_length": 286.0,
+      "epoch": 0.04622222222222222,
+      "grad_norm": 0.4177192368926344,
+      "kl": 0.0013208389282226562,
+      "learning_rate": 3.529411764705882e-07,
+      "loss": 0.0019,
+      "num_tokens": 1947113.0,
+      "reward": 0.0078125,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 1019.8984375,
+      "completions/mean_terminated_length": 499.0,
+      "completions/min_length": 499.0,
+      "completions/min_terminated_length": 499.0,
+      "epoch": 0.049777777777777775,
+      "grad_norm": 1.030360015389124,
+      "kl": 0.0012826919555664062,
+      "learning_rate": 3.8235294117647053e-07,
+      "loss": 0.0,
+      "num_tokens": 2096668.0,
+      "reward": 0.0078125,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.0078125,
+      "rewards/equation_reward_func/std": 0.0883883461356163,
+      "rewards/format_reward_func/mean": 0.0078125,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.006130423266424807,
+      "kl": 0.001644134521484375,
+      "learning_rate": 4.117647058823529e-07,
+      "loss": 0.0,
+      "num_tokens": 2246772.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.05688888888888889,
+      "grad_norm": 0.47084182350703124,
+      "kl": 0.0013189315795898438,
+      "learning_rate": 4.4117647058823526e-07,
+      "loss": 0.0,
+      "num_tokens": 2396868.0,
+      "reward": 0.0078125,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.060444444444444446,
+      "grad_norm": 0.39485183865726337,
+      "kl": 0.0012559890747070312,
+      "learning_rate": 4.705882352941176e-07,
+      "loss": 0.0,
+      "num_tokens": 2546980.0,
+      "reward": 0.0078125,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 384.0,
+      "completions/mean_length": 1019.0,
+      "completions/mean_terminated_length": 384.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 384.0,
+      "epoch": 0.064,
+      "grad_norm": 0.5277276749582551,
+      "kl": 0.0015764236450195312,
+      "learning_rate": 5e-07,
+      "loss": 0.0,
+      "num_tokens": 2696456.0,
+      "reward": 0.01171875,
+      "reward_std": 0.016833597794175148,
+      "rewards/equation_reward_func/mean": 0.0234375,
+      "rewards/equation_reward_func/std": 0.15188287198543549,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 922.0,
+      "completions/mean_length": 1023.203125,
+      "completions/mean_terminated_length": 922.0,
+      "completions/min_length": 922.0,
+      "completions/min_terminated_length": 922.0,
+      "epoch": 0.06755555555555555,
+      "grad_norm": 0.6327836514564609,
+      "kl": 0.0015048980712890625,
+      "learning_rate": 4.999958464872182e-07,
+      "loss": 0.0,
+      "num_tokens": 2846446.0,
+      "reward": 0.01171875,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.0234375,
+      "rewards/equation_reward_func/std": 0.15188287198543549,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 974.0,
+      "completions/mean_length": 1023.609375,
+      "completions/mean_terminated_length": 974.0,
+      "completions/min_length": 974.0,
+      "completions/min_terminated_length": 974.0,
+      "epoch": 0.07111111111111111,
+      "grad_norm": 0.3191632805099472,
+      "kl": 0.0015163421630859375,
+      "learning_rate": 4.999833860868863e-07,
+      "loss": 0.0,
+      "num_tokens": 2996508.0,
+      "reward": 0.00390625,
+      "reward_std": 0.0078125,
+      "rewards/equation_reward_func/mean": 0.0078125,
+      "rewards/equation_reward_func/std": 0.0883883461356163,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 612.0,
+      "completions/mean_length": 1020.78125,
+      "completions/mean_terminated_length": 612.0,
+      "completions/min_length": 612.0,
+      "completions/min_terminated_length": 612.0,
+      "epoch": 0.07466666666666667,
+      "grad_norm": 0.5895570635639454,
+      "kl": 0.00185394287109375,
+      "learning_rate": 4.999626192130396e-07,
+      "loss": 0.0,
+      "num_tokens": 3146148.0,
+      "reward": 0.015625,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.03125,
+      "rewards/equation_reward_func/std": 0.1746762990951538,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 185.0,
+      "completions/mean_length": 1017.4453125,
+      "completions/mean_terminated_length": 185.0,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 0.07822222222222222,
+      "grad_norm": 0.8536716379125333,
+      "kl": 0.002094268798828125,
+      "learning_rate": 4.99933546555722e-07,
+      "loss": -0.0036,
+      "num_tokens": 3295353.0,
+      "reward": 0.0234375,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 844.0,
+      "completions/mean_length": 1022.59375,
+      "completions/mean_terminated_length": 844.0,
+      "completions/min_length": 844.0,
+      "completions/min_terminated_length": 844.0,
+      "epoch": 0.08177777777777778,
+      "grad_norm": 0.761179697351255,
+      "kl": 0.0025005340576171875,
+      "learning_rate": 4.998961690809627e-07,
+      "loss": -0.0012,
+      "num_tokens": 3445333.0,
+      "reward": 0.015625,
+      "reward_std": 0.024646097794175148,
+      "rewards/equation_reward_func/mean": 0.0234375,
+      "rewards/equation_reward_func/std": 0.15188287198543549,
+      "rewards/format_reward_func/mean": 0.0078125,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.08533333333333333,
+      "grad_norm": 0.9164427994280226,
+      "kl": 0.0030078887939453125,
+      "learning_rate": 4.998504880307444e-07,
+      "loss": 0.0,
+      "num_tokens": 3595429.0,
+      "reward": 0.03125,
+      "reward_std": 0.05589609593153,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 660.0,
+      "completions/mean_length": 1021.15625,
+      "completions/mean_terminated_length": 660.0,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "epoch": 0.08888888888888889,
+      "grad_norm": 0.49620338336363207,
+      "kl": 0.0037441253662109375,
+      "learning_rate": 4.997965049229614e-07,
+      "loss": 0.0,
+      "num_tokens": 3745113.0,
+      "reward": 0.01171875,
+      "reward_std": 0.016833597794175148,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 0.0078125,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.09244444444444444,
+      "grad_norm": 0.9470188431084309,
+      "kl": 0.0042877197265625,
+      "learning_rate": 4.997342215513703e-07,
+      "loss": 0.0,
+      "num_tokens": 3895201.0,
+      "reward": 0.0234375,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.984375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 896.0,
+      "completions/mean_length": 1017.71875,
+      "completions/mean_terminated_length": 622.0,
+      "completions/min_length": 348.0,
+      "completions/min_terminated_length": 348.0,
+      "epoch": 0.096,
+      "grad_norm": 0.9027646554963968,
+      "kl": 0.005214691162109375,
+      "learning_rate": 4.99663639985529e-07,
+      "loss": -0.0028,
+      "num_tokens": 4044461.0,
+      "reward": 0.0234375,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.0390625,
+      "rewards/equation_reward_func/std": 0.194504976272583,
+      "rewards/format_reward_func/mean": 0.0078125,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9765625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1010.0,
+      "completions/mean_length": 1016.2734375,
+      "completions/mean_terminated_length": 694.3333740234375,
+      "completions/min_length": 363.0,
+      "completions/min_terminated_length": 363.0,
+      "epoch": 0.09955555555555555,
+      "grad_norm": 1.1212987874003704,
+      "kl": 0.0063610076904296875,
+      "learning_rate": 4.995847625707292e-07,
+      "loss": -0.0012,
+      "num_tokens": 4193588.0,
+      "reward": 0.03515625,
+      "reward_std": 0.06370859593153,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.0078125,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.10311111111111111,
+      "grad_norm": 0.5105677644040013,
+      "kl": 0.0079345703125,
+      "learning_rate": 4.994975919279175e-07,
+      "loss": 0.0,
+      "num_tokens": 4343632.0,
+      "reward": 0.0078125,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.0078125,
+      "rewards/equation_reward_func/std": 0.0883883461356163,
+      "rewards/format_reward_func/mean": 0.0078125,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.40495448135577117,
+      "kl": 0.010833740234375,
+      "learning_rate": 4.994021309536092e-07,
+      "loss": 0.0,
+      "num_tokens": 4493784.0,
+      "reward": 0.01171875,
+      "reward_std": 0.0078125,
+      "rewards/equation_reward_func/mean": 0.0234375,
+      "rewards/equation_reward_func/std": 0.15188287198543549,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 806.0,
+      "completions/mean_length": 1022.296875,
+      "completions/mean_terminated_length": 806.0,
+      "completions/min_length": 806.0,
+      "completions/min_terminated_length": 806.0,
+      "epoch": 0.11022222222222222,
+      "grad_norm": 1.1342927264328857,
+      "kl": 0.010311126708984375,
+      "learning_rate": 4.992983828197911e-07,
+      "loss": 0.0,
+      "num_tokens": 4643598.0,
+      "reward": 0.0234375,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.0390625,
+      "rewards/equation_reward_func/std": 0.194504976272583,
+      "rewards/format_reward_func/mean": 0.0078125,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.11377777777777778,
+      "grad_norm": 0.7512859872904276,
+      "kl": 0.01190948486328125,
+      "learning_rate": 4.991863509738169e-07,
+      "loss": 0.0,
+      "num_tokens": 4793666.0,
+      "reward": 0.04296875,
+      "reward_std": 0.0661257952451706,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.0078125,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 1019.96875,
+      "completions/mean_terminated_length": 508.0,
+      "completions/min_length": 508.0,
+      "completions/min_terminated_length": 508.0,
+      "epoch": 0.11733333333333333,
+      "grad_norm": 0.9416577312187854,
+      "kl": 0.02165985107421875,
+      "learning_rate": 4.990660391382923e-07,
+      "loss": -0.0021,
+      "num_tokens": 4943234.0,
+      "reward": 0.046875,
+      "reward_std": 0.05831329524517059,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.0234375,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.12088888888888889,
+      "grad_norm": 0.7545248940723044,
+      "kl": 0.0414276123046875,
+      "learning_rate": 4.989374513109511e-07,
+      "loss": 0.0,
+      "num_tokens": 5093326.0,
+      "reward": 0.0390625,
+      "reward_std": 0.05589609593153,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.0078125,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.12444444444444444,
+      "grad_norm": 0.7218584701517031,
+      "kl": 0.0307769775390625,
+      "learning_rate": 4.988005917645229e-07,
+      "loss": 0.0,
+      "num_tokens": 5243446.0,
+      "reward": 0.03515625,
+      "reward_std": 0.057104695588350296,
+      "rewards/equation_reward_func/mean": 0.0390625,
+      "rewards/equation_reward_func/std": 0.194504976272583,
+      "rewards/format_reward_func/mean": 0.03125,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.128,
+      "grad_norm": 0.8075422097889196,
+      "kl": 0.043731689453125,
+      "learning_rate": 4.986554650465906e-07,
+      "loss": 0.0,
+      "num_tokens": 5393610.0,
+      "reward": 0.02734375,
+      "reward_std": 0.04808359593153,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.0078125,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.13155555555555556,
+      "grad_norm": 0.6475461270291981,
+      "kl": 0.044952392578125,
+      "learning_rate": 4.985020759794397e-07,
+      "loss": 0.0,
+      "num_tokens": 5543710.0,
+      "reward": 0.03515625,
+      "reward_std": 0.0546875,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.1351111111111111,
+      "grad_norm": 0.754339390837337,
+      "kl": 0.0526123046875,
+      "learning_rate": 4.983404296598978e-07,
+      "loss": 0.0001,
+      "num_tokens": 5693790.0,
+      "reward": 0.06640625,
+      "reward_std": 0.0973757952451706,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.0390625,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.13866666666666666,
+      "grad_norm": 0.7041423598747788,
+      "kl": 0.057220458984375,
+      "learning_rate": 4.981705314591655e-07,
+      "loss": 0.0001,
+      "num_tokens": 5843866.0,
+      "reward": 0.05859375,
+      "reward_std": 0.07272969186306,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.0390625,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.14222222222222222,
+      "grad_norm": 0.649800348452034,
+      "kl": 0.037445068359375,
+      "learning_rate": 4.979923870226372e-07,
+      "loss": 0.0,
+      "num_tokens": 5994018.0,
+      "reward": 0.0546875,
+      "reward_std": 0.07767495512962341,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.015625,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 673.0,
+      "completions/mean_length": 1021.2578125,
+      "completions/mean_terminated_length": 673.0,
+      "completions/min_length": 673.0,
+      "completions/min_terminated_length": 673.0,
+      "epoch": 0.14577777777777778,
+      "grad_norm": 0.9175028834738074,
+      "kl": 0.047149658203125,
+      "learning_rate": 4.978060022697148e-07,
+      "loss": 0.0,
+      "num_tokens": 6143759.0,
+      "reward": 0.1015625,
+      "reward_std": 0.12323048710823059,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.09375,
+      "rewards/format_reward_func/std": 0.29262590408325195,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 700.0,
+      "completions/mean_length": 1021.46875,
+      "completions/mean_terminated_length": 700.0,
+      "completions/min_length": 700.0,
+      "completions/min_terminated_length": 700.0,
+      "epoch": 0.14933333333333335,
+      "grad_norm": 0.8957971400137922,
+      "kl": 0.052337646484375,
+      "learning_rate": 4.976113833936098e-07,
+      "loss": 0.0039,
+      "num_tokens": 6293559.0,
+      "reward": 0.1015625,
+      "reward_std": 0.13169725239276886,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.078125,
+      "rewards/format_reward_func/std": 0.2694226801395416,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.15288888888888888,
+      "grad_norm": 2.0122883915276466,
+      "kl": 0.170257568359375,
+      "learning_rate": 4.974085368611381e-07,
+      "loss": 0.0002,
+      "num_tokens": 6443715.0,
+      "reward": 0.0703125,
+      "reward_std": 0.09803006052970886,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.0703125,
+      "rewards/format_reward_func/std": 0.2566775679588318,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.15644444444444444,
+      "grad_norm": 7.0766541269315875,
+      "kl": 0.728851318359375,
+      "learning_rate": 4.971974694125051e-07,
+      "loss": 0.0007,
+      "num_tokens": 6593715.0,
+      "reward": 0.140625,
+      "reward_std": 0.1269671469926834,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.1640625,
+      "rewards/format_reward_func/std": 0.371787428855896,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.16,
+      "grad_norm": 4.872646650485278,
+      "kl": 0.322509765625,
+      "learning_rate": 4.969781880610813e-07,
+      "loss": 0.0003,
+      "num_tokens": 6743827.0,
+      "reward": 0.1640625,
+      "reward_std": 0.19241680204868317,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.1796875,
+      "rewards/format_reward_func/std": 0.3854354918003082,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.16355555555555557,
+      "grad_norm": 1.0088881954713835,
+      "kl": 0.070465087890625,
+      "learning_rate": 4.967507000931702e-07,
+      "loss": 0.0001,
+      "num_tokens": 6893915.0,
+      "reward": 0.1953125,
+      "reward_std": 0.1971469223499298,
+      "rewards/equation_reward_func/mean": 0.1953125,
+      "rewards/equation_reward_func/std": 0.3979988098144531,
+      "rewards/format_reward_func/mean": 0.1953125,
+      "rewards/format_reward_func/std": 0.3979988098144531,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.1671111111111111,
+      "grad_norm": 3.180876744240654,
+      "kl": 0.444976806640625,
+      "learning_rate": 4.965150130677651e-07,
+      "loss": 0.0004,
+      "num_tokens": 7043971.0,
+      "reward": 0.19921875,
+      "reward_std": 0.2139805108308792,
+      "rewards/equation_reward_func/mean": 0.1796875,
+      "rewards/equation_reward_func/std": 0.3854354918003082,
+      "rewards/format_reward_func/mean": 0.21875,
+      "rewards/format_reward_func/std": 0.41502299904823303,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.17066666666666666,
+      "grad_norm": 1.2194553812166182,
+      "kl": 0.196197509765625,
+      "learning_rate": 4.962711348162987e-07,
+      "loss": 0.0002,
+      "num_tokens": 7194079.0,
+      "reward": 0.1640625,
+      "reward_std": 0.16161686182022095,
+      "rewards/equation_reward_func/mean": 0.15625,
+      "rewards/equation_reward_func/std": 0.3645188808441162,
+      "rewards/format_reward_func/mean": 0.171875,
+      "rewards/format_reward_func/std": 0.3787541687488556,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.17422222222222222,
+      "grad_norm": 1.3058480703859512,
+      "kl": 0.093231201171875,
+      "learning_rate": 4.960190734423824e-07,
+      "loss": 0.0001,
+      "num_tokens": 7344127.0,
+      "reward": 0.2109375,
+      "reward_std": 0.2336813509464264,
+      "rewards/equation_reward_func/mean": 0.171875,
+      "rewards/equation_reward_func/std": 0.3787541687488556,
+      "rewards/format_reward_func/mean": 0.25,
+      "rewards/format_reward_func/std": 0.434714138507843,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.17777777777777778,
+      "grad_norm": 0.8822642087267891,
+      "kl": 0.10333251953125,
+      "learning_rate": 4.957588373215373e-07,
+      "loss": 0.0001,
+      "num_tokens": 7494299.0,
+      "reward": 0.14453125,
+      "reward_std": 0.15854540467262268,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.1953125,
+      "rewards/format_reward_func/std": 0.3979988098144531,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.18133333333333335,
+      "grad_norm": 5.708085997498834,
+      "kl": 0.7578125,
+      "learning_rate": 4.954904351009156e-07,
+      "loss": 0.0008,
+      "num_tokens": 7644343.0,
+      "reward": 0.2421875,
+      "reward_std": 0.24566304683685303,
+      "rewards/equation_reward_func/mean": 0.203125,
+      "rewards/equation_reward_func/std": 0.40390563011169434,
+      "rewards/format_reward_func/mean": 0.28125,
+      "rewards/format_reward_func/std": 0.4513758420944214,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 526.0,
+      "completions/mean_length": 1020.109375,
+      "completions/mean_terminated_length": 526.0,
+      "completions/min_length": 526.0,
+      "completions/min_terminated_length": 526.0,
+      "epoch": 0.18488888888888888,
+      "grad_norm": 1.323493532471514,
+      "kl": 0.12872314453125,
+      "learning_rate": 4.952138756990142e-07,
+      "loss": 0.0001,
+      "num_tokens": 7793957.0,
+      "reward": 0.19921875,
+      "reward_std": 0.19263195991516113,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.265625,
+      "rewards/format_reward_func/std": 0.44340085983276367,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.18844444444444444,
+      "grad_norm": 1.4983556320698812,
+      "kl": 0.248046875,
+      "learning_rate": 4.949291683053768e-07,
+      "loss": 0.0002,
+      "num_tokens": 7943977.0,
+      "reward": 0.2109375,
+      "reward_std": 0.22092357277870178,
+      "rewards/equation_reward_func/mean": 0.171875,
+      "rewards/equation_reward_func/std": 0.3787541687488556,
+      "rewards/format_reward_func/mean": 0.25,
+      "rewards/format_reward_func/std": 0.434714138507843,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.192,
+      "grad_norm": 2.4977361409936805,
+      "kl": 0.43695068359375,
+      "learning_rate": 4.946363223802901e-07,
+      "loss": 0.0004,
+      "num_tokens": 8094081.0,
+      "reward": 0.2734375,
+      "reward_std": 0.211696058511734,
+      "rewards/equation_reward_func/mean": 0.2265625,
+      "rewards/equation_reward_func/std": 0.4202519655227661,
+      "rewards/format_reward_func/mean": 0.3203125,
+      "rewards/format_reward_func/std": 0.4684300124645233,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 797.0,
+      "completions/mean_length": 1022.2265625,
+      "completions/mean_terminated_length": 797.0,
+      "completions/min_length": 797.0,
+      "completions/min_terminated_length": 797.0,
+      "epoch": 0.19555555555555557,
+      "grad_norm": 1.6221417605008561,
+      "kl": 0.24462890625,
+      "learning_rate": 4.943353476544681e-07,
+      "loss": -0.0012,
+      "num_tokens": 8243906.0,
+      "reward": 0.23046875,
+      "reward_std": 0.19329938292503357,
+      "rewards/equation_reward_func/mean": 0.15625,
+      "rewards/equation_reward_func/std": 0.3645188808441162,
+      "rewards/format_reward_func/mean": 0.3046875,
+      "rewards/format_reward_func/std": 0.46208351850509644,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.1991111111111111,
+      "grad_norm": 2.6760159527432683,
+      "kl": 0.376220703125,
+      "learning_rate": 4.940262541287302e-07,
+      "loss": 0.0004,
+      "num_tokens": 8393978.0,
+      "reward": 0.21875,
+      "reward_std": 0.18384575843811035,
+      "rewards/equation_reward_func/mean": 0.15625,
+      "rewards/equation_reward_func/std": 0.3645188808441162,
+      "rewards/format_reward_func/mean": 0.28125,
+      "rewards/format_reward_func/std": 0.4513758420944214,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.20266666666666666,
+      "grad_norm": 22.524324011242694,
+      "kl": 2.3280029296875,
+      "learning_rate": 4.937090520736671e-07,
+      "loss": 0.0023,
+      "num_tokens": 8544074.0,
+      "reward": 0.1953125,
+      "reward_std": 0.22807088494300842,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.2734375,
+      "rewards/format_reward_func/std": 0.447474867105484,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.20622222222222222,
+      "grad_norm": 6.300577551599378,
+      "kl": 0.8486328125,
+      "learning_rate": 4.933837520293017e-07,
+      "loss": 0.0008,
+      "num_tokens": 8694182.0,
+      "reward": 0.22265625,
+      "reward_std": 0.20167279243469238,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.296875,
+      "rewards/format_reward_func/std": 0.45867621898651123,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.20977777777777779,
+      "grad_norm": 28.006158731494125,
+      "kl": 0.86181640625,
+      "learning_rate": 4.930503648047367e-07,
+      "loss": 0.0009,
+      "num_tokens": 8844278.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2949552536010742,
+      "rewards/equation_reward_func/mean": 0.1953125,
+      "rewards/equation_reward_func/std": 0.3979988098144531,
+      "rewards/format_reward_func/mean": 0.3515625,
+      "rewards/format_reward_func/std": 0.4793342351913452,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.21333333333333335,
+      "grad_norm": 9.33584965819257,
+      "kl": 0.4361572265625,
+      "learning_rate": 4.927089014777972e-07,
+      "loss": 0.0004,
+      "num_tokens": 8994330.0,
+      "reward": 0.28125,
+      "reward_std": 0.24258936941623688,
+      "rewards/equation_reward_func/mean": 0.1953125,
+      "rewards/equation_reward_func/std": 0.3979988098144531,
+      "rewards/format_reward_func/mean": 0.3671875,
+      "rewards/format_reward_func/std": 0.4839322865009308,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.21688888888888888,
+      "grad_norm": 6.303532871380138,
+      "kl": 0.6859130859375,
+      "learning_rate": 4.923593733946614e-07,
+      "loss": 0.0007,
+      "num_tokens": 9144434.0,
+      "reward": 0.296875,
+      "reward_std": 0.21597611904144287,
+      "rewards/equation_reward_func/mean": 0.171875,
+      "rewards/equation_reward_func/std": 0.3787541687488556,
+      "rewards/format_reward_func/mean": 0.421875,
+      "rewards/format_reward_func/std": 0.4957992732524872,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.22044444444444444,
+      "grad_norm": 8.57852289803169,
+      "kl": 1.74853515625,
+      "learning_rate": 4.920017921694841e-07,
+      "loss": 0.0018,
+      "num_tokens": 9294478.0,
+      "reward": 0.2890625,
+      "reward_std": 0.20475518703460693,
+      "rewards/equation_reward_func/mean": 0.2109375,
+      "rewards/equation_reward_func/std": 0.4095771610736847,
+      "rewards/format_reward_func/mean": 0.3671875,
+      "rewards/format_reward_func/std": 0.4839322865009308,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.224,
+      "grad_norm": 3.8660166772097195,
+      "kl": 0.3358154296875,
+      "learning_rate": 4.91636169684011e-07,
+      "loss": 0.0003,
+      "num_tokens": 9444610.0,
+      "reward": 0.2734375,
+      "reward_std": 0.23609855771064758,
+      "rewards/equation_reward_func/mean": 0.1640625,
+      "rewards/equation_reward_func/std": 0.371787428855896,
+      "rewards/format_reward_func/mean": 0.3828125,
+      "rewards/format_reward_func/std": 0.4879830479621887,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.22755555555555557,
+      "grad_norm": 12.93870306614523,
+      "kl": 1.1669921875,
+      "learning_rate": 4.912625180871833e-07,
+      "loss": 0.0012,
+      "num_tokens": 9594650.0,
+      "reward": 0.25390625,
+      "reward_std": 0.23785054683685303,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.375,
+      "rewards/format_reward_func/std": 0.4860251843929291,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.2311111111111111,
+      "grad_norm": 7.511898616426601,
+      "kl": 0.55322265625,
+      "learning_rate": 4.908808497947346e-07,
+      "loss": 0.0006,
+      "num_tokens": 9744694.0,
+      "reward": 0.33984375,
+      "reward_std": 0.27659574151039124,
+      "rewards/equation_reward_func/mean": 0.234375,
+      "rewards/equation_reward_func/std": 0.42527204751968384,
+      "rewards/format_reward_func/mean": 0.4453125,
+      "rewards/format_reward_func/std": 0.4989531338214874,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.23466666666666666,
+      "grad_norm": 1.8438447972930012,
+      "kl": 0.2886962890625,
+      "learning_rate": 4.904911774887779e-07,
+      "loss": 0.0003,
+      "num_tokens": 9894774.0,
+      "reward": 0.3515625,
+      "reward_std": 0.24986067414283752,
+      "rewards/equation_reward_func/mean": 0.2890625,
+      "rewards/equation_reward_func/std": 0.45510825514793396,
+      "rewards/format_reward_func/mean": 0.4140625,
+      "rewards/format_reward_func/std": 0.49449479579925537,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 668.0,
+      "completions/mean_length": 1021.21875,
+      "completions/mean_terminated_length": 668.0,
+      "completions/min_length": 668.0,
+      "completions/min_terminated_length": 668.0,
+      "epoch": 0.23822222222222222,
+      "grad_norm": 1.4441757047110426,
+      "kl": 0.230224609375,
+      "learning_rate": 4.900935141173842e-07,
+      "loss": 0.0016,
+      "num_tokens": 10044558.0,
+      "reward": 0.234375,
+      "reward_std": 0.21178939938545227,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.3359375,
+      "rewards/format_reward_func/std": 0.47417303919792175,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.24177777777777779,
+      "grad_norm": 1.6691952805088401,
+      "kl": 0.329833984375,
+      "learning_rate": 4.896878728941531e-07,
+      "loss": 0.0003,
+      "num_tokens": 10194698.0,
+      "reward": 0.203125,
+      "reward_std": 0.20396818220615387,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.3046875,
+      "rewards/format_reward_func/std": 0.46208351850509644,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.24533333333333332,
+      "grad_norm": 13.720929458457615,
+      "kl": 3.0303955078125,
+      "learning_rate": 4.892742672977722e-07,
+      "loss": 0.003,
+      "num_tokens": 10344786.0,
+      "reward": 0.3203125,
+      "reward_std": 0.25942516326904297,
+      "rewards/equation_reward_func/mean": 0.21875,
+      "rewards/equation_reward_func/std": 0.41502299904823303,
+      "rewards/format_reward_func/mean": 0.421875,
+      "rewards/format_reward_func/std": 0.4957992732524872,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.24888888888888888,
+      "grad_norm": 2.4164360870318733,
+      "kl": 0.5142822265625,
+      "learning_rate": 4.888527110715709e-07,
+      "loss": 0.0005,
+      "num_tokens": 10494850.0,
+      "reward": 0.29296875,
+      "reward_std": 0.2201562374830246,
+      "rewards/equation_reward_func/mean": 0.2109375,
+      "rewards/equation_reward_func/std": 0.4095771610736847,
+      "rewards/format_reward_func/mean": 0.375,
+      "rewards/format_reward_func/std": 0.4860251843929291,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 552.0,
+      "completions/mean_length": 1020.3125,
+      "completions/mean_terminated_length": 552.0,
+      "completions/min_length": 552.0,
+      "completions/min_terminated_length": 552.0,
+      "epoch": 0.25244444444444447,
+      "grad_norm": 1.7293321306646259,
+      "kl": 0.32080078125,
+      "learning_rate": 4.884232182230623e-07,
+      "loss": -0.003,
+      "num_tokens": 10644490.0,
+      "reward": 0.234375,
+      "reward_std": 0.23798327147960663,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.328125,
+      "rewards/format_reward_func/std": 0.4713755249977112,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.256,
+      "grad_norm": 1.115711151199567,
+      "kl": 0.373046875,
+      "learning_rate": 4.879858030234789e-07,
+      "loss": 0.0004,
+      "num_tokens": 10794594.0,
+      "reward": 0.28125,
+      "reward_std": 0.17779618501663208,
+      "rewards/equation_reward_func/mean": 0.21875,
+      "rewards/equation_reward_func/std": 0.41502299904823303,
+      "rewards/format_reward_func/mean": 0.34375,
+      "rewards/format_reward_func/std": 0.47682511806488037,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.25955555555555554,
+      "grad_norm": 24.35412756965178,
+      "kl": 2.08587646484375,
+      "learning_rate": 4.875404800072976e-07,
+      "loss": 0.0021,
+      "num_tokens": 10944666.0,
+      "reward": 0.2109375,
+      "reward_std": 0.2127828449010849,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.3125,
+      "rewards/format_reward_func/std": 0.4653336703777313,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.26311111111111113,
+      "grad_norm": 1554.2762073710478,
+      "kl": 112.7879638671875,
+      "learning_rate": 4.870872639717572e-07,
+      "loss": 0.1126,
+      "num_tokens": 11094806.0,
+      "reward": 0.17578125,
+      "reward_std": 0.2021140456199646,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.28125,
+      "rewards/format_reward_func/std": 0.4513758420944214,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.26666666666666666,
+      "grad_norm": 1.1410728341145782,
+      "kl": 0.2967529296875,
+      "learning_rate": 4.866261699763664e-07,
+      "loss": 0.0003,
+      "num_tokens": 11244874.0,
+      "reward": 0.24609375,
+      "reward_std": 0.2407177835702896,
+      "rewards/equation_reward_func/mean": 0.1875,
+      "rewards/equation_reward_func/std": 0.39184603095054626,
+      "rewards/format_reward_func/mean": 0.3046875,
+      "rewards/format_reward_func/std": 0.46208351850509644,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.2702222222222222,
+      "grad_norm": 6.430383302085185,
+      "kl": 1.0128173828125,
+      "learning_rate": 4.861572133424035e-07,
+      "loss": 0.001,
+      "num_tokens": 11394958.0,
+      "reward": 0.28515625,
+      "reward_std": 0.2505149245262146,
+      "rewards/equation_reward_func/mean": 0.1953125,
+      "rewards/equation_reward_func/std": 0.3979988098144531,
+      "rewards/format_reward_func/mean": 0.375,
+      "rewards/format_reward_func/std": 0.4860251843929291,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.2737777777777778,
+      "grad_norm": 7.0576042073325,
+      "kl": 1.8004150390625,
+      "learning_rate": 4.856804096524078e-07,
+      "loss": 0.0018,
+      "num_tokens": 11545110.0,
+      "reward": 0.22265625,
+      "reward_std": 0.16942934691905975,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.296875,
+      "rewards/format_reward_func/std": 0.45867621898651123,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.2773333333333333,
+      "grad_norm": 56.11950857705648,
+      "kl": 7.319580078125,
+      "learning_rate": 4.851957747496606e-07,
+      "loss": 0.0073,
+      "num_tokens": 11695202.0,
+      "reward": 0.1953125,
+      "reward_std": 0.19627749919891357,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.2421875,
+      "rewards/format_reward_func/std": 0.4300905168056488,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.2808888888888889,
+      "grad_norm": 1.1894403579564377,
+      "kl": 0.2705078125,
+      "learning_rate": 4.847033247376605e-07,
+      "loss": 0.0003,
+      "num_tokens": 11845266.0,
+      "reward": 0.16796875,
+      "reward_std": 0.20112939178943634,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.2421875,
+      "rewards/format_reward_func/std": 0.4300905168056488,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.28444444444444444,
+      "grad_norm": 15.58649087673573,
+      "kl": 5.33154296875,
+      "learning_rate": 4.842030759795866e-07,
+      "loss": 0.0053,
+      "num_tokens": 11995326.0,
+      "reward": 0.20703125,
+      "reward_std": 0.19561228156089783,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.2890625,
+      "rewards/format_reward_func/std": 0.45510825514793396,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.288,
+      "grad_norm": 1.2211524541098628,
+      "kl": 0.345703125,
+      "learning_rate": 4.836950450977558e-07,
+      "loss": 0.0003,
+      "num_tokens": 12145330.0,
+      "reward": 0.28515625,
+      "reward_std": 0.2513952851295471,
+      "rewards/equation_reward_func/mean": 0.2265625,
+      "rewards/equation_reward_func/std": 0.4202519655227661,
+      "rewards/format_reward_func/mean": 0.34375,
+      "rewards/format_reward_func/std": 0.47682511806488037,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.29155555555555557,
+      "grad_norm": 11.595035432688796,
+      "kl": 0.9814453125,
+      "learning_rate": 4.831792489730703e-07,
+      "loss": 0.001,
+      "num_tokens": 12295462.0,
+      "reward": 0.171875,
+      "reward_std": 0.16823168098926544,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.1953125,
+      "rewards/format_reward_func/std": 0.3979988098144531,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.2951111111111111,
+      "grad_norm": 1.1924550929402917,
+      "kl": 0.3919677734375,
+      "learning_rate": 4.826557047444563e-07,
+      "loss": 0.0004,
+      "num_tokens": 12445498.0,
+      "reward": 0.14453125,
+      "reward_std": 0.18077430129051208,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.1953125,
+      "rewards/format_reward_func/std": 0.3979988098144531,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.2986666666666667,
+      "grad_norm": 21.230632420638482,
+      "kl": 3.0062255859375,
+      "learning_rate": 4.821244298082951e-07,
+      "loss": 0.003,
+      "num_tokens": 12595542.0,
+      "reward": 0.22265625,
+      "reward_std": 0.22652746737003326,
+      "rewards/equation_reward_func/mean": 0.171875,
+      "rewards/equation_reward_func/std": 0.3787541687488556,
+      "rewards/format_reward_func/mean": 0.2734375,
+      "rewards/format_reward_func/std": 0.447474867105484,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.3022222222222222,
+      "grad_norm": 1.0369057223186056,
+      "kl": 0.2333984375,
+      "learning_rate": 4.815854418178445e-07,
+      "loss": 0.0002,
+      "num_tokens": 12745614.0,
+      "reward": 0.19140625,
+      "reward_std": 0.20221619307994843,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.2578125,
+      "rewards/format_reward_func/std": 0.43914902210235596,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.30577777777777776,
+      "grad_norm": 1.0278344273119406,
+      "kl": 0.1983642578125,
+      "learning_rate": 4.810387586826527e-07,
+      "loss": 0.0002,
+      "num_tokens": 12895698.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2464390993118286,
+      "rewards/equation_reward_func/mean": 0.203125,
+      "rewards/equation_reward_func/std": 0.40390563011169434,
+      "rewards/format_reward_func/mean": 0.34375,
+      "rewards/format_reward_func/std": 0.47682511806488037,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.30933333333333335,
+      "grad_norm": 9.38166565142255,
+      "kl": 1.9534912109375,
+      "learning_rate": 4.804843985679626e-07,
+      "loss": 0.002,
+      "num_tokens": 13045762.0,
+      "reward": 0.171875,
+      "reward_std": 0.2152000367641449,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.2109375,
+      "rewards/format_reward_func/std": 0.4095771610736847,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.3128888888888889,
+      "grad_norm": 1.1768601924784283,
+      "kl": 0.3209228515625,
+      "learning_rate": 4.799223798941089e-07,
+      "loss": 0.0003,
+      "num_tokens": 13195926.0,
+      "reward": 0.16015625,
+      "reward_std": 0.18319149315357208,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.21875,
+      "rewards/format_reward_func/std": 0.41502299904823303,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.3164444444444444,
+      "grad_norm": 0.6415947722993769,
+      "kl": 0.1815185546875,
+      "learning_rate": 4.793527213359058e-07,
+      "loss": 0.0002,
+      "num_tokens": 13346058.0,
+      "reward": 0.1015625,
+      "reward_std": 0.1392945945262909,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.125,
+      "rewards/format_reward_func/std": 0.3320184051990509,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.32,
+      "grad_norm": 1.1209935354079488,
+      "kl": 0.1925048828125,
+      "learning_rate": 4.787754418220257e-07,
+      "loss": 0.0002,
+      "num_tokens": 13496054.0,
+      "reward": 0.171875,
+      "reward_std": 0.21861067414283752,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.2265625,
+      "rewards/format_reward_func/std": 0.4202519655227661,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.32355555555555554,
+      "grad_norm": 1.1350649820782601,
+      "kl": 0.2437744140625,
+      "learning_rate": 4.781905605343716e-07,
+      "loss": 0.0002,
+      "num_tokens": 13646078.0,
+      "reward": 0.16796875,
+      "reward_std": 0.1717531979084015,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.234375,
+      "rewards/format_reward_func/std": 0.42527204751968384,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.32711111111111113,
+      "grad_norm": 0.8036131940570752,
+      "kl": 0.2301025390625,
+      "learning_rate": 4.775980969074385e-07,
+      "loss": 0.0002,
+      "num_tokens": 13796150.0,
+      "reward": 0.17578125,
+      "reward_std": 0.1618429571390152,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.203125,
+      "rewards/format_reward_func/std": 0.40390563011169434,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.33066666666666666,
+      "grad_norm": 2.5669796312022144,
+      "kl": 0.4324951171875,
+      "learning_rate": 4.769980706276687e-07,
+      "loss": 0.0004,
+      "num_tokens": 13946262.0,
+      "reward": 0.203125,
+      "reward_std": 0.2259906381368637,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.2578125,
+      "rewards/format_reward_func/std": 0.43914902210235596,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.3342222222222222,
+      "grad_norm": 1.5566319365843342,
+      "kl": 0.2623291015625,
+      "learning_rate": 4.7639050163279646e-07,
+      "loss": 0.0003,
+      "num_tokens": 14096386.0,
+      "reward": 0.14453125,
+      "reward_std": 0.1585453897714615,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.1796875,
+      "rewards/format_reward_func/std": 0.3854354918003082,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.3377777777777778,
+      "grad_norm": 0.9827466599698991,
+      "kl": 0.2091064453125,
+      "learning_rate": 4.757754101111867e-07,
+      "loss": 0.0002,
+      "num_tokens": 14246454.0,
+      "reward": 0.2109375,
+      "reward_std": 0.2073678970336914,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.2734375,
+      "rewards/format_reward_func/std": 0.447474867105484,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.3413333333333333,
+      "grad_norm": 0.8474200819506911,
+      "kl": 0.1885986328125,
+      "learning_rate": 4.751528165011633e-07,
+      "loss": 0.0002,
+      "num_tokens": 14396482.0,
+      "reward": 0.1953125,
+      "reward_std": 0.19494709372520447,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.25,
+      "rewards/format_reward_func/std": 0.434714138507843,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.3448888888888889,
+      "grad_norm": 0.8935139225334848,
+      "kl": 0.19091796875,
+      "learning_rate": 4.7452274149033036e-07,
+      "loss": 0.0002,
+      "num_tokens": 14546506.0,
+      "reward": 0.22265625,
+      "reward_std": 0.1896713674068451,
+      "rewards/equation_reward_func/mean": 0.171875,
+      "rewards/equation_reward_func/std": 0.3787541687488556,
+      "rewards/format_reward_func/mean": 0.2734375,
+      "rewards/format_reward_func/std": 0.447474867105484,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.34844444444444445,
+      "grad_norm": 0.8908823710522052,
+      "kl": 0.1903076171875,
+      "learning_rate": 4.738852060148848e-07,
+      "loss": 0.0002,
+      "num_tokens": 14696570.0,
+      "reward": 0.16015625,
+      "reward_std": 0.17767438292503357,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.1953125,
+      "rewards/format_reward_func/std": 0.3979988098144531,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.352,
+      "grad_norm": 0.8940407168561494,
+      "kl": 0.15765380859375,
+      "learning_rate": 4.7324023125892067e-07,
+      "loss": 0.0002,
+      "num_tokens": 14846678.0,
+      "reward": 0.13671875,
+      "reward_std": 0.16569268703460693,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.2109375,
+      "rewards/format_reward_func/std": 0.4095771610736847,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.35555555555555557,
+      "grad_norm": 1.0372146373302493,
+      "kl": 0.2196044921875,
+      "learning_rate": 4.7258783865372496e-07,
+      "loss": 0.0002,
+      "num_tokens": 14996674.0,
+      "reward": 0.296875,
+      "reward_std": 0.29111427068710327,
+      "rewards/equation_reward_func/mean": 0.2421875,
+      "rewards/equation_reward_func/std": 0.4300905168056488,
+      "rewards/format_reward_func/mean": 0.3515625,
+      "rewards/format_reward_func/std": 0.4793342351913452,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.3591111111111111,
+      "grad_norm": 3.8175886041475904,
+      "kl": 0.525634765625,
+      "learning_rate": 4.719280498770659e-07,
+      "loss": 0.0005,
+      "num_tokens": 15146770.0,
+      "reward": 0.265625,
+      "reward_std": 0.1947406530380249,
+      "rewards/equation_reward_func/mean": 0.234375,
+      "rewards/equation_reward_func/std": 0.42527204751968384,
+      "rewards/format_reward_func/mean": 0.296875,
+      "rewards/format_reward_func/std": 0.45867621898651123,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.3626666666666667,
+      "grad_norm": 1.7543264789310198,
+      "kl": 0.295166015625,
+      "learning_rate": 4.712608868524726e-07,
+      "loss": 0.0003,
+      "num_tokens": 15296818.0,
+      "reward": 0.1796875,
+      "reward_std": 0.18956050276756287,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.28125,
+      "rewards/format_reward_func/std": 0.4513758420944214,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.3662222222222222,
+      "grad_norm": 1.0227525620642117,
+      "kl": 0.20050048828125,
+      "learning_rate": 4.70586371748506e-07,
+      "loss": 0.0002,
+      "num_tokens": 15446870.0,
+      "reward": 0.2578125,
+      "reward_std": 0.19561007618904114,
+      "rewards/equation_reward_func/mean": 0.21875,
+      "rewards/equation_reward_func/std": 0.41502299904823303,
+      "rewards/format_reward_func/mean": 0.296875,
+      "rewards/format_reward_func/std": 0.45867621898651123,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.36977777777777776,
+      "grad_norm": 0.9640594800874314,
+      "kl": 0.2506103515625,
+      "learning_rate": 4.699045269780232e-07,
+      "loss": 0.0003,
+      "num_tokens": 15596974.0,
+      "reward": 0.25390625,
+      "reward_std": 0.21409572660923004,
+      "rewards/equation_reward_func/mean": 0.1875,
+      "rewards/equation_reward_func/std": 0.39184603095054626,
+      "rewards/format_reward_func/mean": 0.3203125,
+      "rewards/format_reward_func/std": 0.4684300124645233,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.9924489289475691,
+      "kl": 0.281982421875,
+      "learning_rate": 4.692153751974318e-07,
+      "loss": 0.0003,
+      "num_tokens": 15747106.0,
+      "reward": 0.2421875,
+      "reward_std": 0.1833132803440094,
+      "rewards/equation_reward_func/mean": 0.15625,
+      "rewards/equation_reward_func/std": 0.3645188808441162,
+      "rewards/format_reward_func/mean": 0.328125,
+      "rewards/format_reward_func/std": 0.4713755249977112,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.3768888888888889,
+      "grad_norm": 1.1958906934430737,
+      "kl": 0.2791748046875,
+      "learning_rate": 4.685189393059377e-07,
+      "loss": 0.0003,
+      "num_tokens": 15897182.0,
+      "reward": 0.28515625,
+      "reward_std": 0.2538124918937683,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.453125,
+      "rewards/format_reward_func/std": 0.4997538626194,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.3804444444444444,
+      "grad_norm": 4.660458310479511,
+      "kl": 0.49755859375,
+      "learning_rate": 4.6781524244478374e-07,
+      "loss": 0.0005,
+      "num_tokens": 16047302.0,
+      "reward": 0.31640625,
+      "reward_std": 0.23313136398792267,
+      "rewards/equation_reward_func/mean": 0.1640625,
+      "rewards/equation_reward_func/std": 0.371787428855896,
+      "rewards/format_reward_func/mean": 0.46875,
+      "rewards/format_reward_func/std": 0.5009832978248596,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.384,
+      "grad_norm": 1.2324512257183518,
+      "kl": 0.3973388671875,
+      "learning_rate": 4.6710430799648143e-07,
+      "loss": 0.0004,
+      "num_tokens": 16197302.0,
+      "reward": 0.3125,
+      "reward_std": 0.19484493136405945,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.4765625,
+      "rewards/format_reward_func/std": 0.5014128684997559,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.38755555555555554,
+      "grad_norm": 1.2549179396404038,
+      "kl": 0.2296142578125,
+      "learning_rate": 4.663861595840332e-07,
+      "loss": 0.0002,
+      "num_tokens": 16347306.0,
+      "reward": 0.390625,
+      "reward_std": 0.24787381291389465,
+      "rewards/equation_reward_func/mean": 0.1875,
+      "rewards/equation_reward_func/std": 0.39184603095054626,
+      "rewards/format_reward_func/mean": 0.59375,
+      "rewards/format_reward_func/std": 0.4930621087551117,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.39111111111111113,
+      "grad_norm": 1.078204553596969,
+      "kl": 0.3565673828125,
+      "learning_rate": 4.6566082107014795e-07,
+      "loss": 0.0004,
+      "num_tokens": 16497418.0,
+      "reward": 0.27734375,
+      "reward_std": 0.22730353474617004,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.4375,
+      "rewards/format_reward_func/std": 0.49802759289741516,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.39466666666666667,
+      "grad_norm": 13.095496410143678,
+      "kl": 1.576416015625,
+      "learning_rate": 4.649283165564479e-07,
+      "loss": 0.0016,
+      "num_tokens": 16647462.0,
+      "reward": 0.30859375,
+      "reward_std": 0.19462978839874268,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.4921875,
+      "rewards/format_reward_func/std": 0.5019033551216125,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.3982222222222222,
+      "grad_norm": 1.1570771083116458,
+      "kl": 0.3052978515625,
+      "learning_rate": 4.6418867038266807e-07,
+      "loss": 0.0003,
+      "num_tokens": 16797554.0,
+      "reward": 0.3046875,
+      "reward_std": 0.227077454328537,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.46875,
+      "rewards/format_reward_func/std": 0.5009832978248596,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.4017777777777778,
+      "grad_norm": 1.2112330269604161,
+      "kl": 0.267578125,
+      "learning_rate": 4.6344190712584713e-07,
+      "loss": 0.0003,
+      "num_tokens": 16947698.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2886773347854614,
+      "rewards/equation_reward_func/mean": 0.1796875,
+      "rewards/equation_reward_func/std": 0.3854354918003082,
+      "rewards/format_reward_func/mean": 0.5546875,
+      "rewards/format_reward_func/std": 0.4989531338214874,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.4053333333333333,
+      "grad_norm": 1.0237828322135445,
+      "kl": 0.248291015625,
+      "learning_rate": 4.6268805159951086e-07,
+      "loss": 0.0002,
+      "num_tokens": 17097782.0,
+      "reward": 0.33984375,
+      "reward_std": 0.22488634288311005,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.546875,
+      "rewards/format_reward_func/std": 0.4997538626194,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.4088888888888889,
+      "grad_norm": 3.3025894103980704,
+      "kl": 0.5584716796875,
+      "learning_rate": 4.619271288528478e-07,
+      "loss": 0.0006,
+      "num_tokens": 17247882.0,
+      "reward": 0.27734375,
+      "reward_std": 0.2406156361103058,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.453125,
+      "rewards/format_reward_func/std": 0.4997538626194,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.41244444444444445,
+      "grad_norm": 4.1382469339470225,
+      "kl": 0.8095703125,
+      "learning_rate": 4.611591641698768e-07,
+      "loss": 0.0008,
+      "num_tokens": 17397994.0,
+      "reward": 0.28125,
+      "reward_std": 0.1947515904903412,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.46875,
+      "rewards/format_reward_func/std": 0.5009832978248596,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.416,
+      "grad_norm": 15.502900105571765,
+      "kl": 3.479736328125,
+      "learning_rate": 4.6038418306860695e-07,
+      "loss": 0.0035,
+      "num_tokens": 17548122.0,
+      "reward": 0.33203125,
+      "reward_std": 0.2296164482831955,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.5625,
+      "rewards/format_reward_func/std": 0.49802759289741516,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.41955555555555557,
+      "grad_norm": 1.2758988778808227,
+      "kl": 0.275390625,
+      "learning_rate": 4.596022113001894e-07,
+      "loss": 0.0003,
+      "num_tokens": 17698266.0,
+      "reward": 0.375,
+      "reward_std": 0.203322634100914,
+      "rewards/equation_reward_func/mean": 0.1875,
+      "rewards/equation_reward_func/std": 0.39184603095054626,
+      "rewards/format_reward_func/mean": 0.5625,
+      "rewards/format_reward_func/std": 0.49802759289741516,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.4231111111111111,
+      "grad_norm": 1.1770866594193163,
+      "kl": 0.3016357421875,
+      "learning_rate": 4.58813274848062e-07,
+      "loss": 0.0003,
+      "num_tokens": 17848314.0,
+      "reward": 0.37890625,
+      "reward_std": 0.23390743136405945,
+      "rewards/equation_reward_func/mean": 0.1953125,
+      "rewards/equation_reward_func/std": 0.3979988098144531,
+      "rewards/format_reward_func/mean": 0.5625,
+      "rewards/format_reward_func/std": 0.49802759289741516,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.4266666666666667,
+      "grad_norm": 2.4545856054451156,
+      "kl": 0.7242431640625,
+      "learning_rate": 4.5801739992708604e-07,
+      "loss": 0.0007,
+      "num_tokens": 17998410.0,
+      "reward": 0.31640625,
+      "reward_std": 0.208717942237854,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.5078125,
+      "rewards/format_reward_func/std": 0.5019033551216125,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.43022222222222223,
+      "grad_norm": 1.4734406735109156,
+      "kl": 0.3726806640625,
+      "learning_rate": 4.572146129826746e-07,
+      "loss": 0.0004,
+      "num_tokens": 18148458.0,
+      "reward": 0.46875,
+      "reward_std": 0.23896577954292297,
+      "rewards/equation_reward_func/mean": 0.2421875,
+      "rewards/equation_reward_func/std": 0.4300905168056488,
+      "rewards/format_reward_func/mean": 0.6953125,
+      "rewards/format_reward_func/std": 0.46208351850509644,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.43377777777777776,
+      "grad_norm": 5.758478512827765,
+      "kl": 1.009033203125,
+      "learning_rate": 4.5640494068991454e-07,
+      "loss": 0.001,
+      "num_tokens": 18298598.0,
+      "reward": 0.32421875,
+      "reward_std": 0.20069028437137604,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.5234375,
+      "rewards/format_reward_func/std": 0.5014128684997559,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.43733333333333335,
+      "grad_norm": 1.1248450015614464,
+      "kl": 0.288330078125,
+      "learning_rate": 4.555884099526793e-07,
+      "loss": 0.0003,
+      "num_tokens": 18448610.0,
+      "reward": 0.40234375,
+      "reward_std": 0.20794187486171722,
+      "rewards/equation_reward_func/mean": 0.1875,
+      "rewards/equation_reward_func/std": 0.39184603095054626,
+      "rewards/format_reward_func/mean": 0.6171875,
+      "rewards/format_reward_func/std": 0.4879830479621887,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.4408888888888889,
+      "grad_norm": 1.117208300016219,
+      "kl": 0.271728515625,
+      "learning_rate": 4.547650479027361e-07,
+      "loss": 0.0003,
+      "num_tokens": 18598694.0,
+      "reward": 0.41015625,
+      "reward_std": 0.20739847421646118,
+      "rewards/equation_reward_func/mean": 0.2109375,
+      "rewards/equation_reward_func/std": 0.4095771610736847,
+      "rewards/format_reward_func/mean": 0.609375,
+      "rewards/format_reward_func/std": 0.4898075461387634,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.4444444444444444,
+      "grad_norm": 1.212716720763158,
+      "kl": 0.28369140625,
+      "learning_rate": 4.53934881898843e-07,
+      "loss": 0.0003,
+      "num_tokens": 18748774.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2037726789712906,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.625,
+      "rewards/format_reward_func/std": 0.4860251843929291,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.448,
+      "grad_norm": 1.4912885010455994,
+      "kl": 0.28466796875,
+      "learning_rate": 4.5309793952584095e-07,
+      "loss": 0.0003,
+      "num_tokens": 18898894.0,
+      "reward": 0.30078125,
+      "reward_std": 0.2230125367641449,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.4921875,
+      "rewards/format_reward_func/std": 0.5019033551216125,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.45155555555555554,
+      "grad_norm": 1.6085493649140328,
+      "kl": 0.369873046875,
+      "learning_rate": 4.5225424859373684e-07,
+      "loss": 0.0004,
+      "num_tokens": 19048970.0,
+      "reward": 0.35546875,
+      "reward_std": 0.21158519387245178,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.625,
+      "rewards/format_reward_func/std": 0.4860251843929291,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.45511111111111113,
+      "grad_norm": 8.1136624800545,
+      "kl": 1.5572509765625,
+      "learning_rate": 4.514038371367791e-07,
+      "loss": 0.0016,
+      "num_tokens": 19199078.0,
+      "reward": 0.42578125,
+      "reward_std": 0.19231687486171722,
+      "rewards/equation_reward_func/mean": 0.15625,
+      "rewards/equation_reward_func/std": 0.3645188808441162,
+      "rewards/format_reward_func/mean": 0.6953125,
+      "rewards/format_reward_func/std": 0.46208351850509644,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.45866666666666667,
+      "grad_norm": 1.4312492558258578,
+      "kl": 0.3763427734375,
+      "learning_rate": 4.5054673341252657e-07,
+      "loss": 0.0004,
+      "num_tokens": 19349186.0,
+      "reward": 0.34765625,
+      "reward_std": 0.1864890456199646,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.5859375,
+      "rewards/format_reward_func/std": 0.49449479579925537,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.4622222222222222,
+      "grad_norm": 2.962105238864389,
+      "kl": 0.706787109375,
+      "learning_rate": 4.496829659009095e-07,
+      "loss": 0.0007,
+      "num_tokens": 19499346.0,
+      "reward": 0.34765625,
+      "reward_std": 0.18802587687969208,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.59375,
+      "rewards/format_reward_func/std": 0.4930621087551117,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.4657777777777778,
+      "grad_norm": 354.1491510027472,
+      "kl": 56.3818359375,
+      "learning_rate": 4.488125633032831e-07,
+      "loss": 0.0565,
+      "num_tokens": 19649510.0,
+      "reward": 0.33984375,
+      "reward_std": 0.21212857961654663,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.578125,
+      "rewards/format_reward_func/std": 0.4957992732524872,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.4693333333333333,
+      "grad_norm": 174.51099175867512,
+      "kl": 9.54150390625,
+      "learning_rate": 4.479355545414738e-07,
+      "loss": 0.0096,
+      "num_tokens": 19799590.0,
+      "reward": 0.41796875,
+      "reward_std": 0.19462977349758148,
+      "rewards/equation_reward_func/mean": 0.171875,
+      "rewards/equation_reward_func/std": 0.3787541687488556,
+      "rewards/format_reward_func/mean": 0.6640625,
+      "rewards/format_reward_func/std": 0.47417303919792175,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.4728888888888889,
+      "grad_norm": 1.397870689490664,
+      "kl": 0.516845703125,
+      "learning_rate": 4.470519687568185e-07,
+      "loss": 0.0005,
+      "num_tokens": 19949690.0,
+      "reward": 0.38671875,
+      "reward_std": 0.18506528437137604,
+      "rewards/equation_reward_func/mean": 0.1640625,
+      "rewards/equation_reward_func/std": 0.371787428855896,
+      "rewards/format_reward_func/mean": 0.609375,
+      "rewards/format_reward_func/std": 0.4898075461387634,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.47644444444444445,
+      "grad_norm": 1.0641630797189836,
+      "kl": 0.30322265625,
+      "learning_rate": 4.4616183530919604e-07,
+      "loss": 0.0003,
+      "num_tokens": 20099654.0,
+      "reward": 0.4609375,
+      "reward_std": 0.17670938372612,
+      "rewards/equation_reward_func/mean": 0.1875,
+      "rewards/equation_reward_func/std": 0.39184603095054626,
+      "rewards/format_reward_func/mean": 0.734375,
+      "rewards/format_reward_func/std": 0.44340085983276367,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.48,
+      "grad_norm": 6.158216503528497,
+      "kl": 1.213623046875,
+      "learning_rate": 4.452651837760515e-07,
+      "loss": 0.0012,
+      "num_tokens": 20249794.0,
+      "reward": 0.42578125,
+      "reward_std": 0.17373128235340118,
+      "rewards/equation_reward_func/mean": 0.1953125,
+      "rewards/equation_reward_func/std": 0.3979988098144531,
+      "rewards/format_reward_func/mean": 0.65625,
+      "rewards/format_reward_func/std": 0.47682511806488037,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.48355555555555557,
+      "grad_norm": 1.3444116770940566,
+      "kl": 0.523681640625,
+      "learning_rate": 4.443620439514138e-07,
+      "loss": 0.0005,
+      "num_tokens": 20399882.0,
+      "reward": 0.3984375,
+      "reward_std": 0.16162778437137604,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.6953125,
+      "rewards/format_reward_func/std": 0.46208351850509644,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.4871111111111111,
+      "grad_norm": 1.531233557438578,
+      "kl": 0.55908203125,
+      "learning_rate": 4.4345244584490535e-07,
+      "loss": 0.0006,
+      "num_tokens": 20550006.0,
+      "reward": 0.48046875,
+      "reward_std": 0.21245458722114563,
+      "rewards/equation_reward_func/mean": 0.28125,
+      "rewards/equation_reward_func/std": 0.4513758420944214,
+      "rewards/format_reward_func/mean": 0.6796875,
+      "rewards/format_reward_func/std": 0.4684300124645233,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.49066666666666664,
+      "grad_norm": 7.99434662228326,
+      "kl": 1.595458984375,
+      "learning_rate": 4.4253641968074505e-07,
+      "loss": 0.0016,
+      "num_tokens": 20700002.0,
+      "reward": 0.43359375,
+      "reward_std": 0.20639410614967346,
+      "rewards/equation_reward_func/mean": 0.1953125,
+      "rewards/equation_reward_func/std": 0.3979988098144531,
+      "rewards/format_reward_func/mean": 0.671875,
+      "rewards/format_reward_func/std": 0.4713755249977112,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.49422222222222223,
+      "grad_norm": 3.051331037407785,
+      "kl": 0.939697265625,
+      "learning_rate": 4.41613995896744e-07,
+      "loss": 0.0009,
+      "num_tokens": 20850102.0,
+      "reward": 0.3671875,
+      "reward_std": 0.19056487083435059,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.640625,
+      "rewards/format_reward_func/std": 0.481702595949173,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.49777777777777776,
+      "grad_norm": 4.814397537365208,
+      "kl": 1.9078369140625,
+      "learning_rate": 4.40685205143294e-07,
+      "loss": 0.0019,
+      "num_tokens": 21000234.0,
+      "reward": 0.375,
+      "reward_std": 0.1520632803440094,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.6171875,
+      "rewards/format_reward_func/std": 0.4879830479621887,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5013333333333333,
+      "grad_norm": 3.2517027439890924,
+      "kl": 0.801025390625,
+      "learning_rate": 4.3975007828234914e-07,
+      "loss": 0.0008,
+      "num_tokens": 21150326.0,
+      "reward": 0.43359375,
+      "reward_std": 0.2021140456199646,
+      "rewards/equation_reward_func/mean": 0.1640625,
+      "rewards/equation_reward_func/std": 0.371787428855896,
+      "rewards/format_reward_func/mean": 0.703125,
+      "rewards/format_reward_func/std": 0.45867621898651123,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5048888888888889,
+      "grad_norm": 1.663987879451297,
+      "kl": 0.525634765625,
+      "learning_rate": 4.3880864638640035e-07,
+      "loss": 0.0005,
+      "num_tokens": 21300382.0,
+      "reward": 0.4609375,
+      "reward_std": 0.17912657558918,
+      "rewards/equation_reward_func/mean": 0.1796875,
+      "rewards/equation_reward_func/std": 0.3854354918003082,
+      "rewards/format_reward_func/mean": 0.7421875,
+      "rewards/format_reward_func/std": 0.43914902210235596,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5084444444444445,
+      "grad_norm": 1.6351203503838305,
+      "kl": 0.519287109375,
+      "learning_rate": 4.37860940737443e-07,
+      "loss": 0.0005,
+      "num_tokens": 21450566.0,
+      "reward": 0.26171875,
+      "reward_std": 0.21883678436279297,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 0.5078125,
+      "rewards/format_reward_func/std": 0.5019033551216125,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.512,
+      "grad_norm": 1.366258610631795,
+      "kl": 0.574462890625,
+      "learning_rate": 4.3690699282593723e-07,
+      "loss": 0.0006,
+      "num_tokens": 21600694.0,
+      "reward": 0.4140625,
+      "reward_std": 0.22181487083435059,
+      "rewards/equation_reward_func/mean": 0.15625,
+      "rewards/equation_reward_func/std": 0.3645188808441162,
+      "rewards/format_reward_func/mean": 0.671875,
+      "rewards/format_reward_func/std": 0.4713755249977112,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5155555555555555,
+      "grad_norm": 1.2979528663337039,
+      "kl": 0.371826171875,
+      "learning_rate": 4.3594683434976186e-07,
+      "loss": 0.0004,
+      "num_tokens": 21750818.0,
+      "reward": 0.41015625,
+      "reward_std": 0.23919188976287842,
+      "rewards/equation_reward_func/mean": 0.1875,
+      "rewards/equation_reward_func/std": 0.39184603095054626,
+      "rewards/format_reward_func/mean": 0.6328125,
+      "rewards/format_reward_func/std": 0.4839322865009308,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5191111111111111,
+      "grad_norm": 13.971745146487256,
+      "kl": 2.259521484375,
+      "learning_rate": 4.3498049721316087e-07,
+      "loss": 0.0023,
+      "num_tokens": 21900886.0,
+      "reward": 0.48828125,
+      "reward_std": 0.18275237083435059,
+      "rewards/equation_reward_func/mean": 0.28125,
+      "rewards/equation_reward_func/std": 0.4513758420944214,
+      "rewards/format_reward_func/mean": 0.6953125,
+      "rewards/format_reward_func/std": 0.46208351850509644,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5226666666666666,
+      "grad_norm": 1.5151109592881105,
+      "kl": 0.50927734375,
+      "learning_rate": 4.340080135256835e-07,
+      "loss": 0.0005,
+      "num_tokens": 22050994.0,
+      "reward": 0.375,
+      "reward_std": 0.1720726490020752,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.609375,
+      "rewards/format_reward_func/std": 0.4898075461387634,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5262222222222223,
+      "grad_norm": 3.565613250452234,
+      "kl": 1.1435546875,
+      "learning_rate": 4.3302941560111716e-07,
+      "loss": 0.0011,
+      "num_tokens": 22201058.0,
+      "reward": 0.44140625,
+      "reward_std": 0.1594257354736328,
+      "rewards/equation_reward_func/mean": 0.1953125,
+      "rewards/equation_reward_func/std": 0.3979988098144531,
+      "rewards/format_reward_func/mean": 0.6875,
+      "rewards/format_reward_func/std": 0.4653336703777313,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5297777777777778,
+      "grad_norm": 1.1082031850469054,
+      "kl": 0.389892578125,
+      "learning_rate": 4.3204473595641367e-07,
+      "loss": 0.0004,
+      "num_tokens": 22351074.0,
+      "reward": 0.5,
+      "reward_std": 0.17966999113559723,
+      "rewards/equation_reward_func/mean": 0.234375,
+      "rewards/equation_reward_func/std": 0.42527204751968384,
+      "rewards/format_reward_func/mean": 0.765625,
+      "rewards/format_reward_func/std": 0.42527204751968384,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5333333333333333,
+      "grad_norm": 29.666704145228632,
+      "kl": 5.103271484375,
+      "learning_rate": 4.3105400731060896e-07,
+      "loss": 0.0051,
+      "num_tokens": 22501194.0,
+      "reward": 0.44140625,
+      "reward_std": 0.161842942237854,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.75,
+      "rewards/format_reward_func/std": 0.434714138507843,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5368888888888889,
+      "grad_norm": 72.53647797594527,
+      "kl": 2.019775390625,
+      "learning_rate": 4.300572625837359e-07,
+      "loss": 0.002,
+      "num_tokens": 22651314.0,
+      "reward": 0.3828125,
+      "reward_std": 0.20705929398536682,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.7109375,
+      "rewards/format_reward_func/std": 0.45510825514793396,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5404444444444444,
+      "grad_norm": 1.1753775278093777,
+      "kl": 0.44921875,
+      "learning_rate": 4.2905453489573007e-07,
+      "loss": 0.0004,
+      "num_tokens": 22801386.0,
+      "reward": 0.41015625,
+      "reward_std": 0.15327188372612,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.7265625,
+      "rewards/format_reward_func/std": 0.447474867105484,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.544,
+      "grad_norm": 1.8103430830421683,
+      "kl": 0.861328125,
+      "learning_rate": 4.280458575653296e-07,
+      "loss": 0.0009,
+      "num_tokens": 22951478.0,
+      "reward": 0.4375,
+      "reward_std": 0.16887937486171722,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.7265625,
+      "rewards/format_reward_func/std": 0.447474867105484,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5475555555555556,
+      "grad_norm": 92.17653023102632,
+      "kl": 29.57275390625,
+      "learning_rate": 4.2703126410896815e-07,
+      "loss": 0.0296,
+      "num_tokens": 23101526.0,
+      "reward": 0.421875,
+      "reward_std": 0.2033226490020752,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.7265625,
+      "rewards/format_reward_func/std": 0.447474867105484,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5511111111111111,
+      "grad_norm": 1.8601363913983529,
+      "kl": 0.711669921875,
+      "learning_rate": 4.2601078823966065e-07,
+      "loss": 0.0007,
+      "num_tokens": 23251622.0,
+      "reward": 0.3984375,
+      "reward_std": 0.23224879801273346,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.671875,
+      "rewards/format_reward_func/std": 0.4713755249977112,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5546666666666666,
+      "grad_norm": 1.9655716755995285,
+      "kl": 0.5400390625,
+      "learning_rate": 4.249844638658837e-07,
+      "loss": 0.0005,
+      "num_tokens": 23401694.0,
+      "reward": 0.4140625,
+      "reward_std": 0.1894671618938446,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.71875,
+      "rewards/format_reward_func/std": 0.4513758420944214,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5582222222222222,
+      "grad_norm": 1.2512804150325183,
+      "kl": 0.381103515625,
+      "learning_rate": 4.2395232509044856e-07,
+      "loss": 0.0004,
+      "num_tokens": 23551754.0,
+      "reward": 0.4140625,
+      "reward_std": 0.15502388775348663,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.78125,
+      "rewards/format_reward_func/std": 0.41502299904823303,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5617777777777778,
+      "grad_norm": 4.6864178808609065,
+      "kl": 1.0859375,
+      "learning_rate": 4.229144062093679e-07,
+      "loss": 0.0011,
+      "num_tokens": 23701886.0,
+      "reward": 0.40234375,
+      "reward_std": 0.17801132798194885,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.6796875,
+      "rewards/format_reward_func/std": 0.4684300124645233,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5653333333333334,
+      "grad_norm": 1.2185679157403768,
+      "kl": 0.41064453125,
+      "learning_rate": 4.218707417107166e-07,
+      "loss": 0.0004,
+      "num_tokens": 23852050.0,
+      "reward": 0.37109375,
+      "reward_std": 0.1713140904903412,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.6796875,
+      "rewards/format_reward_func/std": 0.4684300124645233,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5688888888888889,
+      "grad_norm": 1.248032189193752,
+      "kl": 0.443359375,
+      "learning_rate": 4.208213662734852e-07,
+      "loss": 0.0004,
+      "num_tokens": 24002034.0,
+      "reward": 0.48828125,
+      "reward_std": 0.14753741025924683,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.859375,
+      "rewards/format_reward_func/std": 0.3490002751350403,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5724444444444444,
+      "grad_norm": 1.0325417882527244,
+      "kl": 0.435546875,
+      "learning_rate": 4.197663147664281e-07,
+      "loss": 0.0004,
+      "num_tokens": 24152122.0,
+      "reward": 0.44140625,
+      "reward_std": 0.1556890904903412,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.828125,
+      "rewards/format_reward_func/std": 0.3787541687488556,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.576,
+      "grad_norm": 1.1687197449448241,
+      "kl": 0.60693359375,
+      "learning_rate": 4.187056222469046e-07,
+      "loss": 0.0006,
+      "num_tokens": 24302178.0,
+      "reward": 0.5,
+      "reward_std": 0.18526950478553772,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.8515625,
+      "rewards/format_reward_func/std": 0.356930136680603,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5795555555555556,
+      "grad_norm": 0.8560481833926494,
+      "kl": 0.405029296875,
+      "learning_rate": 4.1763932395971433e-07,
+      "loss": 0.0004,
+      "num_tokens": 24452246.0,
+      "reward": 0.51953125,
+      "reward_std": 0.08835469186306,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.8984375,
+      "rewards/format_reward_func/std": 0.3032590448856354,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5831111111111111,
+      "grad_norm": 0.9227050375160734,
+      "kl": 0.361572265625,
+      "learning_rate": 4.1656745533592565e-07,
+      "loss": 0.0004,
+      "num_tokens": 24602314.0,
+      "reward": 0.51953125,
+      "reward_std": 0.112550750374794,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.8984375,
+      "rewards/format_reward_func/std": 0.3032590448856354,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5866666666666667,
+      "grad_norm": 2.9673521250500965,
+      "kl": 0.835693359375,
+      "learning_rate": 4.1549005199169887e-07,
+      "loss": 0.0008,
+      "num_tokens": 24752402.0,
+      "reward": 0.48828125,
+      "reward_std": 0.125758558511734,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.875,
+      "rewards/format_reward_func/std": 0.3320184051990509,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5902222222222222,
+      "grad_norm": 59803.04638668722,
+      "kl": 5376.424560546875,
+      "learning_rate": 4.1440714972710245e-07,
+      "loss": 5.3755,
+      "num_tokens": 24902430.0,
+      "reward": 0.50390625,
+      "reward_std": 0.11067695170640945,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.875,
+      "rewards/format_reward_func/std": 0.3320184051990509,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5937777777777777,
+      "grad_norm": 0.9872798550394157,
+      "kl": 0.467041015625,
+      "learning_rate": 4.1331878452492366e-07,
+      "loss": 0.0005,
+      "num_tokens": 25052534.0,
+      "reward": 0.53125,
+      "reward_std": 0.135988250374794,
+      "rewards/equation_reward_func/mean": 0.1640625,
+      "rewards/equation_reward_func/std": 0.371787428855896,
+      "rewards/format_reward_func/mean": 0.8984375,
+      "rewards/format_reward_func/std": 0.3032590448856354,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5973333333333334,
+      "grad_norm": 0.9968829039320042,
+      "kl": 0.5625,
+      "learning_rate": 4.122249925494726e-07,
+      "loss": 0.0006,
+      "num_tokens": 25202638.0,
+      "reward": 0.546875,
+      "reward_std": 0.09616719186306,
+      "rewards/equation_reward_func/mean": 0.1953125,
+      "rewards/equation_reward_func/std": 0.3979988098144531,
+      "rewards/format_reward_func/mean": 0.8984375,
+      "rewards/format_reward_func/std": 0.3032590448856354,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6008888888888889,
+      "grad_norm": 1.407994078507014,
+      "kl": 0.69482421875,
+      "learning_rate": 4.111258101453809e-07,
+      "loss": 0.0007,
+      "num_tokens": 25352698.0,
+      "reward": 0.47265625,
+      "reward_std": 0.07933359593153,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.8828125,
+      "rewards/format_reward_func/std": 0.322907418012619,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6044444444444445,
+      "grad_norm": 0.9550324441837219,
+      "kl": 0.371337890625,
+      "learning_rate": 4.10021273836394e-07,
+      "loss": 0.0004,
+      "num_tokens": 25502766.0,
+      "reward": 0.47265625,
+      "reward_std": 0.09077189117670059,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.8984375,
+      "rewards/format_reward_func/std": 0.3032590448856354,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.608,
+      "grad_norm": 0.9530508768607797,
+      "kl": 0.538818359375,
+      "learning_rate": 4.0891142032415717e-07,
+      "loss": 0.0005,
+      "num_tokens": 25652830.0,
+      "reward": 0.4765625,
+      "reward_std": 0.0739382952451706,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.90625,
+      "rewards/format_reward_func/std": 0.29262590408325195,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6115555555555555,
+      "grad_norm": 1.3515658288339731,
+      "kl": 0.4130859375,
+      "learning_rate": 4.0779628648699647e-07,
+      "loss": 0.0004,
+      "num_tokens": 25802866.0,
+      "reward": 0.46484375,
+      "reward_std": 0.11960469186306,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.8359375,
+      "rewards/format_reward_func/std": 0.371787428855896,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6151111111111112,
+      "grad_norm": 5.062381594275897,
+      "kl": 1.2841796875,
+      "learning_rate": 4.066759093786931e-07,
+      "loss": 0.0013,
+      "num_tokens": 25952918.0,
+      "reward": 0.5703125,
+      "reward_std": 0.08054219186306,
+      "rewards/equation_reward_func/mean": 0.1953125,
+      "rewards/equation_reward_func/std": 0.3979988098144531,
+      "rewards/format_reward_func/mean": 0.9453125,
+      "rewards/format_reward_func/std": 0.22826264798641205,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6186666666666667,
+      "grad_norm": 1.884658172401711,
+      "kl": 0.83984375,
+      "learning_rate": 4.055503262272521e-07,
+      "loss": 0.0008,
+      "num_tokens": 26103022.0,
+      "reward": 0.5,
+      "reward_std": 0.05589609593153,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.921875,
+      "rewards/format_reward_func/std": 0.2694226801395416,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6222222222222222,
+      "grad_norm": 0.899477885337483,
+      "kl": 0.36328125,
+      "learning_rate": 4.044195744336656e-07,
+      "loss": 0.0004,
+      "num_tokens": 26253170.0,
+      "reward": 0.50390625,
+      "reward_std": 0.1130007952451706,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.8828125,
+      "rewards/format_reward_func/std": 0.322907418012619,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6257777777777778,
+      "grad_norm": 1.1287658852126066,
+      "kl": 0.556884765625,
+      "learning_rate": 4.0328369157066975e-07,
+      "loss": 0.0006,
+      "num_tokens": 26403294.0,
+      "reward": 0.5,
+      "reward_std": 0.10277109593153,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.8984375,
+      "rewards/format_reward_func/std": 0.3032590448856354,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6293333333333333,
+      "grad_norm": 0.9554300863560559,
+      "kl": 0.439453125,
+      "learning_rate": 4.021427153814965e-07,
+      "loss": 0.0004,
+      "num_tokens": 26553438.0,
+      "reward": 0.49609375,
+      "reward_std": 0.0859375,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.9375,
+      "rewards/format_reward_func/std": 0.24301259219646454,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6328888888888888,
+      "grad_norm": 1.8696085163129879,
+      "kl": 0.6962890625,
+      "learning_rate": 4.009966837786194e-07,
+      "loss": 0.0007,
+      "num_tokens": 26703546.0,
+      "reward": 0.45703125,
+      "reward_std": 0.12014809250831604,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.8515625,
+      "rewards/format_reward_func/std": 0.356930136680603,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6364444444444445,
+      "grad_norm": 0.8445236924167039,
+      "kl": 0.44580078125,
+      "learning_rate": 3.9984563484249355e-07,
+      "loss": 0.0004,
+      "num_tokens": 26853626.0,
+      "reward": 0.46484375,
+      "reward_std": 0.06370859593153,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.9296875,
+      "rewards/format_reward_func/std": 0.2566775679588318,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.64,
+      "grad_norm": 8.514060274236515,
+      "kl": 1.341064453125,
+      "learning_rate": 3.98689606820291e-07,
+      "loss": 0.0013,
+      "num_tokens": 27003610.0,
+      "reward": 0.52734375,
+      "reward_std": 0.07933359593153,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.921875,
+      "rewards/format_reward_func/std": 0.2694226801395416,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6435555555555555,
+      "grad_norm": 1.0918541430694348,
+      "kl": 0.39501953125,
+      "learning_rate": 3.975286381246288e-07,
+      "loss": 0.0004,
+      "num_tokens": 27153798.0,
+      "reward": 0.4609375,
+      "reward_std": 0.08714609593153,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 0.90625,
+      "rewards/format_reward_func/std": 0.29262590408325195,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6471111111111111,
+      "grad_norm": 0.6639080655090452,
+      "kl": 0.39892578125,
+      "learning_rate": 3.963627673322936e-07,
+      "loss": 0.0004,
+      "num_tokens": 27303818.0,
+      "reward": 0.55078125,
+      "reward_std": 0.0703125,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6506666666666666,
+      "grad_norm": 0.8008104294625259,
+      "kl": 0.34619140625,
+      "learning_rate": 3.951920331829592e-07,
+      "loss": 0.0003,
+      "num_tokens": 27453838.0,
+      "reward": 0.53125,
+      "reward_std": 0.0625,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6542222222222223,
+      "grad_norm": 2.4570583505386767,
+      "kl": 0.77587890625,
+      "learning_rate": 3.9401647457789977e-07,
+      "loss": 0.0008,
+      "num_tokens": 27603914.0,
+      "reward": 0.49609375,
+      "reward_std": 0.07933359593153,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.9140625,
+      "rewards/format_reward_func/std": 0.2813730239868164,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6577777777777778,
+      "grad_norm": 0.7694633251007069,
+      "kl": 0.36328125,
+      "learning_rate": 3.9283613057869683e-07,
+      "loss": 0.0004,
+      "num_tokens": 27754050.0,
+      "reward": 0.5,
+      "reward_std": 0.0739382952451706,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.9375,
+      "rewards/format_reward_func/std": 0.24301259219646454,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6613333333333333,
+      "grad_norm": 49.68125956240344,
+      "kl": 6.21240234375,
+      "learning_rate": 3.9165104040594144e-07,
+      "loss": 0.0062,
+      "num_tokens": 27904122.0,
+      "reward": 0.48828125,
+      "reward_std": 0.11541798710823059,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.875,
+      "rewards/format_reward_func/std": 0.3320184051990509,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6648888888888889,
+      "grad_norm": 1.597010810734867,
+      "kl": 0.74072265625,
+      "learning_rate": 3.9046124343793104e-07,
+      "loss": 0.0007,
+      "num_tokens": 28054230.0,
+      "reward": 0.50390625,
+      "reward_std": 0.08835469186306,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.921875,
+      "rewards/format_reward_func/std": 0.2694226801395416,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6684444444444444,
+      "grad_norm": 0.8478903871951797,
+      "kl": 0.490966796875,
+      "learning_rate": 3.8926677920936093e-07,
+      "loss": 0.0005,
+      "num_tokens": 28204330.0,
+      "reward": 0.53125,
+      "reward_std": 0.09858439117670059,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.921875,
+      "rewards/format_reward_func/std": 0.2694226801395416,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.672,
+      "grad_norm": 0.7803466603833829,
+      "kl": 0.361328125,
+      "learning_rate": 3.880676874100106e-07,
+      "loss": 0.0004,
+      "num_tokens": 28354446.0,
+      "reward": 0.56640625,
+      "reward_std": 0.0546875,
+      "rewards/equation_reward_func/mean": 0.1796875,
+      "rewards/equation_reward_func/std": 0.3854354918003082,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6755555555555556,
+      "grad_norm": 0.6667441813180046,
+      "kl": 0.370361328125,
+      "learning_rate": 3.868640078834251e-07,
+      "loss": 0.0004,
+      "num_tokens": 28504506.0,
+      "reward": 0.53515625,
+      "reward_std": 0.06370859593153,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6791111111111111,
+      "grad_norm": 0.9717479349571527,
+      "kl": 0.352783203125,
+      "learning_rate": 3.856557806255907e-07,
+      "loss": 0.0004,
+      "num_tokens": 28654702.0,
+      "reward": 0.50390625,
+      "reward_std": 0.11058359593153,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.9140625,
+      "rewards/format_reward_func/std": 0.2813730239868164,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6826666666666666,
+      "grad_norm": 0.6531105470033437,
+      "kl": 0.4169921875,
+      "learning_rate": 3.844430457836064e-07,
+      "loss": 0.0004,
+      "num_tokens": 28804750.0,
+      "reward": 0.53125,
+      "reward_std": 0.05589609593153,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6862222222222222,
+      "grad_norm": 0.7581030751348999,
+      "kl": 0.4326171875,
+      "learning_rate": 3.8322584365434934e-07,
+      "loss": 0.0004,
+      "num_tokens": 28954890.0,
+      "reward": 0.53515625,
+      "reward_std": 0.08548745512962341,
+      "rewards/equation_reward_func/mean": 0.15625,
+      "rewards/equation_reward_func/std": 0.3645188808441162,
+      "rewards/format_reward_func/mean": 0.9140625,
+      "rewards/format_reward_func/std": 0.2813730239868164,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6897777777777778,
+      "grad_norm": 0.8123280487022461,
+      "kl": 0.378173828125,
+      "learning_rate": 3.8200421468313646e-07,
+      "loss": 0.0004,
+      "num_tokens": 29104990.0,
+      "reward": 0.515625,
+      "reward_std": 0.07767495512962341,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.9453125,
+      "rewards/format_reward_func/std": 0.22826264798641205,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.9557167275680298,
+      "kl": 0.458251953125,
+      "learning_rate": 3.807781994623802e-07,
+      "loss": 0.0005,
+      "num_tokens": 29255106.0,
+      "reward": 0.4765625,
+      "reward_std": 0.09616719186306,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.8984375,
+      "rewards/format_reward_func/std": 0.3032590448856354,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6968888888888889,
+      "grad_norm": 20.914129417208414,
+      "kl": 2.05908203125,
+      "learning_rate": 3.7954783873023946e-07,
+      "loss": 0.0021,
+      "num_tokens": 29405238.0,
+      "reward": 0.51171875,
+      "reward_std": 0.08416798710823059,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.9296875,
+      "rewards/format_reward_func/std": 0.2566775679588318,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7004444444444444,
+      "grad_norm": 0.8290581326894807,
+      "kl": 0.460693359375,
+      "learning_rate": 3.7831317336926674e-07,
+      "loss": 0.0005,
+      "num_tokens": 29555306.0,
+      "reward": 0.53125,
+      "reward_std": 0.0739382952451706,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.9453125,
+      "rewards/format_reward_func/std": 0.22826264798641205,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.704,
+      "grad_norm": 0.7450940079039334,
+      "kl": 0.443115234375,
+      "learning_rate": 3.7707424440504863e-07,
+      "loss": 0.0004,
+      "num_tokens": 29705410.0,
+      "reward": 0.48046875,
+      "reward_std": 0.06370859593153,
+      "rewards/equation_reward_func/mean": 0.0234375,
+      "rewards/equation_reward_func/std": 0.15188287198543549,
+      "rewards/format_reward_func/mean": 0.9375,
+      "rewards/format_reward_func/std": 0.24301259219646454,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7075555555555556,
+      "grad_norm": 0.7205252163706303,
+      "kl": 0.453125,
+      "learning_rate": 3.758310930048436e-07,
+      "loss": 0.0005,
+      "num_tokens": 29855442.0,
+      "reward": 0.53125,
+      "reward_std": 0.0625,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7111111111111111,
+      "grad_norm": 1.0214742583016907,
+      "kl": 0.45263671875,
+      "learning_rate": 3.7458376047621356e-07,
+      "loss": 0.0005,
+      "num_tokens": 30005478.0,
+      "reward": 0.55078125,
+      "reward_std": 0.12443909049034119,
+      "rewards/equation_reward_func/mean": 0.171875,
+      "rewards/equation_reward_func/std": 0.3787541687488556,
+      "rewards/format_reward_func/mean": 0.9296875,
+      "rewards/format_reward_func/std": 0.2566775679588318,
+      "step": 200
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7146666666666667,
+      "grad_norm": 4.352162912273971,
+      "kl": 1.650634765625,
+      "learning_rate": 3.733322882656511e-07,
+      "loss": 0.0016,
+      "num_tokens": 30155550.0,
+      "reward": 0.515625,
+      "reward_std": 0.08427885174751282,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.9296875,
+      "rewards/format_reward_func/std": 0.2566775679588318,
+      "step": 201
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7182222222222222,
+      "grad_norm": 0.8642171883878098,
+      "kl": 0.646728515625,
+      "learning_rate": 3.7207671795720296e-07,
+      "loss": 0.0006,
+      "num_tokens": 30305614.0,
+      "reward": 0.53125,
+      "reward_std": 0.0625,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 202
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7217777777777777,
+      "grad_norm": 0.9770611026209789,
+      "kl": 0.48193359375,
+      "learning_rate": 3.7081709127108767e-07,
+      "loss": 0.0005,
+      "num_tokens": 30455722.0,
+      "reward": 0.50390625,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.0234375,
+      "rewards/equation_reward_func/std": 0.15188287198543549,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 203
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7253333333333334,
+      "grad_norm": 0.9670302361571421,
+      "kl": 0.5126953125,
+      "learning_rate": 3.695534500623096e-07,
+      "loss": 0.0005,
+      "num_tokens": 30605826.0,
+      "reward": 0.48828125,
+      "reward_std": 0.04808359593153,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 204
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7288888888888889,
+      "grad_norm": 3.23663467525944,
+      "kl": 0.73828125,
+      "learning_rate": 3.68285836319268e-07,
+      "loss": 0.0007,
+      "num_tokens": 30755898.0,
+      "reward": 0.49609375,
+      "reward_std": 0.04808359593153,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.9453125,
+      "rewards/format_reward_func/std": 0.22826264798641205,
+      "step": 205
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7324444444444445,
+      "grad_norm": 6.934307670841274,
+      "kl": 0.99365234375,
+      "learning_rate": 3.6701429216236204e-07,
+      "loss": 0.001,
+      "num_tokens": 30905906.0,
+      "reward": 0.5703125,
+      "reward_std": 0.08054219186306,
+      "rewards/equation_reward_func/mean": 0.1875,
+      "rewards/equation_reward_func/std": 0.39184603095054626,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 206
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.736,
+      "grad_norm": 0.8933825229974401,
+      "kl": 0.4169921875,
+      "learning_rate": 3.657388598425908e-07,
+      "loss": 0.0004,
+      "num_tokens": 31055970.0,
+      "reward": 0.5390625,
+      "reward_std": 0.10892495512962341,
+      "rewards/equation_reward_func/mean": 0.15625,
+      "rewards/equation_reward_func/std": 0.3645188808441162,
+      "rewards/format_reward_func/mean": 0.921875,
+      "rewards/format_reward_func/std": 0.2694226801395416,
+      "step": 207
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7395555555555555,
+      "grad_norm": 0.7876520940886738,
+      "kl": 0.554931640625,
+      "learning_rate": 3.644595817401501e-07,
+      "loss": 0.0006,
+      "num_tokens": 31206030.0,
+      "reward": 0.5390625,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 208
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7431111111111111,
+      "grad_norm": 0.30620401262295277,
+      "kl": 0.384521484375,
+      "learning_rate": 3.631765003630233e-07,
+      "loss": 0.0004,
+      "num_tokens": 31356098.0,
+      "reward": 0.51953125,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.0390625,
+      "rewards/equation_reward_func/std": 0.194504976272583,
+      "rewards/format_reward_func/mean": 1.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 209
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.565341568697795,
+      "kl": 0.37255859375,
+      "learning_rate": 3.6188965834556964e-07,
+      "loss": 0.0004,
+      "num_tokens": 31506174.0,
+      "reward": 0.54296875,
+      "reward_std": 0.041479695588350296,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 210
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7502222222222222,
+      "grad_norm": 0.7183038048403314,
+      "kl": 0.530029296875,
+      "learning_rate": 3.605990984471073e-07,
+      "loss": 0.0005,
+      "num_tokens": 31656230.0,
+      "reward": 0.53515625,
+      "reward_std": 0.05050079524517059,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 211
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7537777777777778,
+      "grad_norm": 0.9592998793501326,
+      "kl": 0.6806640625,
+      "learning_rate": 3.5930486355049254e-07,
+      "loss": 0.0007,
+      "num_tokens": 31806246.0,
+      "reward": 0.5625,
+      "reward_std": 0.09616719186306,
+      "rewards/equation_reward_func/mean": 0.171875,
+      "rewards/equation_reward_func/std": 0.3787541687488556,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 212
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7573333333333333,
+      "grad_norm": 0.5967010195878546,
+      "kl": 0.388671875,
+      "learning_rate": 3.580069966606949e-07,
+      "loss": 0.0004,
+      "num_tokens": 31956354.0,
+      "reward": 0.52734375,
+      "reward_std": 0.045216359198093414,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 213
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7608888888888888,
+      "grad_norm": 0.5984534897529947,
+      "kl": 0.3974609375,
+      "learning_rate": 3.5670554090336804e-07,
+      "loss": 0.0004,
+      "num_tokens": 32106454.0,
+      "reward": 0.515625,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 214
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7644444444444445,
+      "grad_norm": 1.527700749087867,
+      "kl": 0.652587890625,
+      "learning_rate": 3.55400539523417e-07,
+      "loss": 0.0007,
+      "num_tokens": 32256526.0,
+      "reward": 0.53125,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 215
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.768,
+      "grad_norm": 0.7168370595424849,
+      "kl": 0.469970703125,
+      "learning_rate": 3.5409203588356096e-07,
+      "loss": 0.0005,
+      "num_tokens": 32406606.0,
+      "reward": 0.546875,
+      "reward_std": 0.05589609593153,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 216
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7715555555555556,
+      "grad_norm": 0.6937218726790086,
+      "kl": 0.48388671875,
+      "learning_rate": 3.527800734628927e-07,
+      "loss": 0.0005,
+      "num_tokens": 32556662.0,
+      "reward": 0.55859375,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 217
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7751111111111111,
+      "grad_norm": 0.5625106257141403,
+      "kl": 0.36279296875,
+      "learning_rate": 3.5146469585543386e-07,
+      "loss": 0.0004,
+      "num_tokens": 32706774.0,
+      "reward": 0.546875,
+      "reward_std": 0.05170939117670059,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 218
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7786666666666666,
+      "grad_norm": 1.8811641758087445,
+      "kl": 0.4970703125,
+      "learning_rate": 3.501459467686859e-07,
+      "loss": 0.0005,
+      "num_tokens": 32856802.0,
+      "reward": 0.5390625,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 219
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7822222222222223,
+      "grad_norm": 0.717878846553723,
+      "kl": 0.44091796875,
+      "learning_rate": 3.4882387002217837e-07,
+      "loss": 0.0004,
+      "num_tokens": 33006962.0,
+      "reward": 0.51171875,
+      "reward_std": 0.0703125,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.9453125,
+      "rewards/format_reward_func/std": 0.22826264798641205,
+      "step": 220
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7857777777777778,
+      "grad_norm": 4.888546951880972,
+      "kl": 1.05029296875,
+      "learning_rate": 3.474985095460127e-07,
+      "loss": 0.0011,
+      "num_tokens": 33157078.0,
+      "reward": 0.5625,
+      "reward_std": 0.07152109593153,
+      "rewards/equation_reward_func/mean": 0.15625,
+      "rewards/equation_reward_func/std": 0.3645188808441162,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 221
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7893333333333333,
+      "grad_norm": 0.9668364386663721,
+      "kl": 0.631591796875,
+      "learning_rate": 3.4616990937940207e-07,
+      "loss": 0.0006,
+      "num_tokens": 33307122.0,
+      "reward": 0.515625,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 222
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7928888888888889,
+      "grad_norm": 0.5721228641007269,
+      "kl": 0.40625,
+      "learning_rate": 3.448381136692089e-07,
+      "loss": 0.0004,
+      "num_tokens": 33457150.0,
+      "reward": 0.53515625,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 223
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7964444444444444,
+      "grad_norm": 0.7641757061343398,
+      "kl": 0.369384765625,
+      "learning_rate": 3.435031666684771e-07,
+      "loss": 0.0004,
+      "num_tokens": 33607274.0,
+      "reward": 0.48828125,
+      "reward_std": 0.06986245512962341,
+      "rewards/equation_reward_func/mean": 0.0390625,
+      "rewards/equation_reward_func/std": 0.194504976272583,
+      "rewards/format_reward_func/mean": 0.9375,
+      "rewards/format_reward_func/std": 0.24301259219646454,
+      "step": 224
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8,
+      "grad_norm": 0.676066907329912,
+      "kl": 0.421630859375,
+      "learning_rate": 3.421651127349622e-07,
+      "loss": 0.0004,
+      "num_tokens": 33757358.0,
+      "reward": 0.53125,
+      "reward_std": 0.04929219186306,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 225
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8035555555555556,
+      "grad_norm": 0.3458307366778419,
+      "kl": 0.41552734375,
+      "learning_rate": 3.4082399632965696e-07,
+      "loss": 0.0004,
+      "num_tokens": 33907474.0,
+      "reward": 0.51953125,
+      "reward_std": 0.016833597794175148,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 226
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8071111111111111,
+      "grad_norm": 28.831014334944417,
+      "kl": 3.250244140625,
+      "learning_rate": 3.394798620153147e-07,
+      "loss": 0.0033,
+      "num_tokens": 34057594.0,
+      "reward": 0.54296875,
+      "reward_std": 0.08835469186306,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 227
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8106666666666666,
+      "grad_norm": 0.6851911505727085,
+      "kl": 0.3564453125,
+      "learning_rate": 3.3813275445496766e-07,
+      "loss": 0.0004,
+      "num_tokens": 34207678.0,
+      "reward": 0.49609375,
+      "reward_std": 0.04808359593153,
+      "rewards/equation_reward_func/mean": 0.0234375,
+      "rewards/equation_reward_func/std": 0.15188287198543549,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 228
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8142222222222222,
+      "grad_norm": 4.524751208833203,
+      "kl": 0.817626953125,
+      "learning_rate": 3.367827184104437e-07,
+      "loss": 0.0008,
+      "num_tokens": 34357638.0,
+      "reward": 0.5625,
+      "reward_std": 0.05357225611805916,
+      "rewards/equation_reward_func/mean": 0.1796875,
+      "rewards/equation_reward_func/std": 0.3854354918003082,
+      "rewards/format_reward_func/mean": 0.9453125,
+      "rewards/format_reward_func/std": 0.22826264798641205,
+      "step": 229
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8177777777777778,
+      "grad_norm": 0.7657245453002164,
+      "kl": 0.416748046875,
+      "learning_rate": 3.354297987408784e-07,
+      "loss": 0.0004,
+      "num_tokens": 34507658.0,
+      "reward": 0.5234375,
+      "reward_std": 0.078125,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.9375,
+      "rewards/format_reward_func/std": 0.24301259219646454,
+      "step": 230
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8213333333333334,
+      "grad_norm": 0.6307999515422514,
+      "kl": 0.386962890625,
+      "learning_rate": 3.340740404012251e-07,
+      "loss": 0.0004,
+      "num_tokens": 34657738.0,
+      "reward": 0.56640625,
+      "reward_std": 0.0546875,
+      "rewards/equation_reward_func/mean": 0.171875,
+      "rewards/equation_reward_func/std": 0.3787541687488556,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 231
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8248888888888889,
+      "grad_norm": 0.7615234790182357,
+      "kl": 0.5302734375,
+      "learning_rate": 3.3271548844076034e-07,
+      "loss": 0.0005,
+      "num_tokens": 34807830.0,
+      "reward": 0.5,
+      "reward_std": 0.05589609593153,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.9375,
+      "rewards/format_reward_func/std": 0.24301259219646454,
+      "step": 232
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8284444444444444,
+      "grad_norm": 45.22177711175558,
+      "kl": 7.133056640625,
+      "learning_rate": 3.313541880015877e-07,
+      "loss": 0.0071,
+      "num_tokens": 34957886.0,
+      "reward": 0.49609375,
+      "reward_std": 0.05710469186306,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.9453125,
+      "rewards/format_reward_func/std": 0.22826264798641205,
+      "step": 233
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.832,
+      "grad_norm": 0.4728352153390914,
+      "kl": 0.421142578125,
+      "learning_rate": 3.299901843171374e-07,
+      "loss": 0.0004,
+      "num_tokens": 35108018.0,
+      "reward": 0.51953125,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 234
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8355555555555556,
+      "grad_norm": 0.7556394805634985,
+      "kl": 0.410888671875,
+      "learning_rate": 3.2862352271066324e-07,
+      "loss": 0.0004,
+      "num_tokens": 35258154.0,
+      "reward": 0.51171875,
+      "reward_std": 0.06370859593153,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 235
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8391111111111111,
+      "grad_norm": 60.219344008615124,
+      "kl": 9.630859375,
+      "learning_rate": 3.272542485937368e-07,
+      "loss": 0.0097,
+      "num_tokens": 35408250.0,
+      "reward": 0.53515625,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 236
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8426666666666667,
+      "grad_norm": 0.6258122056705749,
+      "kl": 0.3935546875,
+      "learning_rate": 3.2588240746473866e-07,
+      "loss": 0.0004,
+      "num_tokens": 35558338.0,
+      "reward": 0.546875,
+      "reward_std": 0.04400775954127312,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 237
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8462222222222222,
+      "grad_norm": 0.7556560670668717,
+      "kl": 0.403564453125,
+      "learning_rate": 3.245080449073459e-07,
+      "loss": 0.0004,
+      "num_tokens": 35708434.0,
+      "reward": 0.5078125,
+      "reward_std": 0.07152109593153,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.9453125,
+      "rewards/format_reward_func/std": 0.22826264798641205,
+      "step": 238
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8497777777777777,
+      "grad_norm": 0.6981235833583292,
+      "kl": 0.39892578125,
+      "learning_rate": 3.231312065890183e-07,
+      "loss": 0.0004,
+      "num_tokens": 35858494.0,
+      "reward": 0.51953125,
+      "reward_std": 0.0546875,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 239
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.7057474719839335,
+      "kl": 0.416748046875,
+      "learning_rate": 3.217519382594801e-07,
+      "loss": 0.0004,
+      "num_tokens": 36008622.0,
+      "reward": 0.5390625,
+      "reward_std": 0.05589609593153,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 240
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8568888888888889,
+      "grad_norm": 0.6774457062153861,
+      "kl": 0.3857421875,
+      "learning_rate": 3.203702857492005e-07,
+      "loss": 0.0004,
+      "num_tokens": 36158658.0,
+      "reward": 0.56640625,
+      "reward_std": 0.04808359593153,
+      "rewards/equation_reward_func/mean": 0.15625,
+      "rewards/equation_reward_func/std": 0.3645188808441162,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 241
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8604444444444445,
+      "grad_norm": 0.8579385643539741,
+      "kl": 0.399658203125,
+      "learning_rate": 3.189862949678704e-07,
+      "loss": 0.0004,
+      "num_tokens": 36308694.0,
+      "reward": 0.48046875,
+      "reward_std": 0.06370859593153,
+      "rewards/equation_reward_func/mean": 0.0390625,
+      "rewards/equation_reward_func/std": 0.194504976272583,
+      "rewards/format_reward_func/mean": 0.921875,
+      "rewards/format_reward_func/std": 0.2694226801395416,
+      "step": 242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.864,
+      "grad_norm": 0.8918776174630627,
+      "kl": 0.478515625,
+      "learning_rate": 3.1760001190287695e-07,
+      "loss": 0.0005,
+      "num_tokens": 36458806.0,
+      "reward": 0.50390625,
+      "reward_std": 0.0859375,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.9296875,
+      "rewards/format_reward_func/std": 0.2566775679588318,
+      "step": 243
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8675555555555555,
+      "grad_norm": 14.470212904907461,
+      "kl": 1.18115234375,
+      "learning_rate": 3.162114826177756e-07,
+      "loss": 0.0012,
+      "num_tokens": 36608858.0,
+      "reward": 0.5,
+      "reward_std": 0.049292195588350296,
+      "rewards/equation_reward_func/mean": 0.03125,
+      "rewards/equation_reward_func/std": 0.1746762990951538,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 244
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8711111111111111,
+      "grad_norm": 0.8767487112614263,
+      "kl": 0.48486328125,
+      "learning_rate": 3.148207532507595e-07,
+      "loss": 0.0005,
+      "num_tokens": 36758898.0,
+      "reward": 0.5546875,
+      "reward_std": 0.09375,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 245
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8746666666666667,
+      "grad_norm": 0.7666837173815899,
+      "kl": 0.53466796875,
+      "learning_rate": 3.134278700131262e-07,
+      "loss": 0.0005,
+      "num_tokens": 36909062.0,
+      "reward": 0.51953125,
+      "reward_std": 0.0817507952451706,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.921875,
+      "rewards/format_reward_func/std": 0.2694226801395416,
+      "step": 246
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8782222222222222,
+      "grad_norm": 0.4660959690863843,
+      "kl": 0.361572265625,
+      "learning_rate": 3.1203287918774224e-07,
+      "loss": 0.0004,
+      "num_tokens": 37059178.0,
+      "reward": 0.50390625,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.0234375,
+      "rewards/equation_reward_func/std": 0.15188287198543549,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 247
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8817777777777778,
+      "grad_norm": 0.7433620210188535,
+      "kl": 0.40185546875,
+      "learning_rate": 3.106358271275056e-07,
+      "loss": 0.0004,
+      "num_tokens": 37209262.0,
+      "reward": 0.484375,
+      "reward_std": 0.06491719186306,
+      "rewards/equation_reward_func/mean": 0.03125,
+      "rewards/equation_reward_func/std": 0.1746762990951538,
+      "rewards/format_reward_func/mean": 0.9375,
+      "rewards/format_reward_func/std": 0.24301259219646454,
+      "step": 248
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8853333333333333,
+      "grad_norm": 1.3225278900078785,
+      "kl": 0.643310546875,
+      "learning_rate": 3.0923676025380483e-07,
+      "loss": 0.0006,
+      "num_tokens": 37359390.0,
+      "reward": 0.52734375,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 249
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8888888888888888,
+      "grad_norm": 1.3565201947424719,
+      "kl": 0.63818359375,
+      "learning_rate": 3.078357250549772e-07,
+      "loss": 0.0006,
+      "num_tokens": 37509438.0,
+      "reward": 0.5625,
+      "reward_std": 0.08054219186306,
+      "rewards/equation_reward_func/mean": 0.1875,
+      "rewards/equation_reward_func/std": 0.39184603095054626,
+      "rewards/format_reward_func/mean": 0.9375,
+      "rewards/format_reward_func/std": 0.24301259219646454,
+      "step": 250
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8924444444444445,
+      "grad_norm": 0.7985946426687209,
+      "kl": 0.533203125,
+      "learning_rate": 3.064327680847635e-07,
+      "loss": 0.0005,
+      "num_tokens": 37659486.0,
+      "reward": 0.51171875,
+      "reward_std": 0.0546875,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 251
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.896,
+      "grad_norm": 0.7239646058609255,
+      "kl": 0.391357421875,
+      "learning_rate": 3.0502793596076136e-07,
+      "loss": 0.0004,
+      "num_tokens": 37809642.0,
+      "reward": 0.546875,
+      "reward_std": 0.07393828779459,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 252
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8995555555555556,
+      "grad_norm": 0.7771799102205033,
+      "kl": 0.45751953125,
+      "learning_rate": 3.0362127536287636e-07,
+      "loss": 0.0005,
+      "num_tokens": 37959778.0,
+      "reward": 0.48828125,
+      "reward_std": 0.06370859593153,
+      "rewards/equation_reward_func/mean": 0.03125,
+      "rewards/equation_reward_func/std": 0.1746762990951538,
+      "rewards/format_reward_func/mean": 0.9453125,
+      "rewards/format_reward_func/std": 0.22826264798641205,
+      "step": 253
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9031111111111111,
+      "grad_norm": 1.2830628968694793,
+      "kl": 0.66552734375,
+      "learning_rate": 3.022128330317705e-07,
+      "loss": 0.0007,
+      "num_tokens": 38109842.0,
+      "reward": 0.56640625,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.15625,
+      "rewards/equation_reward_func/std": 0.3645188808441162,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 254
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.5669445746630175,
+      "kl": 0.4755859375,
+      "learning_rate": 3.0080265576730977e-07,
+      "loss": 0.0005,
+      "num_tokens": 38260030.0,
+      "reward": 0.48046875,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 255
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9102222222222223,
+      "grad_norm": 0.6356374575865582,
+      "kl": 0.4111328125,
+      "learning_rate": 2.993907904270084e-07,
+      "loss": 0.0004,
+      "num_tokens": 38410070.0,
+      "reward": 0.49609375,
+      "reward_std": 0.04808359593153,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.9375,
+      "rewards/format_reward_func/std": 0.24301259219646454,
+      "step": 256
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9137777777777778,
+      "grad_norm": 5.301041165177461,
+      "kl": 1.978759765625,
+      "learning_rate": 2.979772839244723e-07,
+      "loss": 0.002,
+      "num_tokens": 38560230.0,
+      "reward": 0.53125,
+      "reward_std": 0.05589609593153,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 257
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9173333333333333,
+      "grad_norm": 0.6352697465433857,
+      "kl": 0.384033203125,
+      "learning_rate": 2.965621832278401e-07,
+      "loss": 0.0004,
+      "num_tokens": 38710354.0,
+      "reward": 0.52734375,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 258
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9208888888888889,
+      "grad_norm": 0.325841215358069,
+      "kl": 0.458740234375,
+      "learning_rate": 2.951455353582224e-07,
+      "loss": 0.0005,
+      "num_tokens": 38860410.0,
+      "reward": 0.53125,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 259
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9244444444444444,
+      "grad_norm": 0.6229004603021032,
+      "kl": 0.416748046875,
+      "learning_rate": 2.937273873881396e-07,
+      "loss": 0.0004,
+      "num_tokens": 39010430.0,
+      "reward": 0.53125,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 260
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.928,
+      "grad_norm": 1.3942727444158145,
+      "kl": 0.628662109375,
+      "learning_rate": 2.9230778643995724e-07,
+      "loss": 0.0006,
+      "num_tokens": 39160466.0,
+      "reward": 0.515625,
+      "reward_std": 0.05589609593153,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.9453125,
+      "rewards/format_reward_func/std": 0.22826264798641205,
+      "step": 261
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9315555555555556,
+      "grad_norm": 0.736925497405656,
+      "kl": 0.414794921875,
+      "learning_rate": 2.90886779684321e-07,
+      "loss": 0.0004,
+      "num_tokens": 39310574.0,
+      "reward": 0.53515625,
+      "reward_std": 0.05710469186306,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.921875,
+      "rewards/format_reward_func/std": 0.2694226801395416,
+      "step": 262
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9351111111111111,
+      "grad_norm": 1.3153915670345286,
+      "kl": 0.77880859375,
+      "learning_rate": 2.894644143385885e-07,
+      "loss": 0.0008,
+      "num_tokens": 39460734.0,
+      "reward": 0.51953125,
+      "reward_std": 0.0703125,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 263
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9386666666666666,
+      "grad_norm": 0.3666627577855282,
+      "kl": 0.4052734375,
+      "learning_rate": 2.8804073766526095e-07,
+      "loss": 0.0004,
+      "num_tokens": 39610814.0,
+      "reward": 0.54296875,
+      "reward_std": 0.020570259541273117,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 264
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9422222222222222,
+      "grad_norm": 2.316729417862906,
+      "kl": 1.9755859375,
+      "learning_rate": 2.866157969704125e-07,
+      "loss": 0.002,
+      "num_tokens": 39760914.0,
+      "reward": 0.53125,
+      "reward_std": 0.024646097794175148,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 265
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9457777777777778,
+      "grad_norm": 0.5361641805790162,
+      "kl": 0.3984375,
+      "learning_rate": 2.851896396021181e-07,
+      "loss": 0.0004,
+      "num_tokens": 39911018.0,
+      "reward": 0.53515625,
+      "reward_std": 0.030584799125790596,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 266
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9493333333333334,
+      "grad_norm": 2.578024238948283,
+      "kl": 0.693115234375,
+      "learning_rate": 2.837623129488808e-07,
+      "loss": 0.0007,
+      "num_tokens": 40061098.0,
+      "reward": 0.51953125,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 267
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9528888888888889,
+      "grad_norm": 0.5179523249592263,
+      "kl": 0.380859375,
+      "learning_rate": 2.823338644380566e-07,
+      "loss": 0.0004,
+      "num_tokens": 40211290.0,
+      "reward": 0.50390625,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.0390625,
+      "rewards/equation_reward_func/std": 0.194504976272583,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 268
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9564444444444444,
+      "grad_norm": 0.6716890754838255,
+      "kl": 0.4169921875,
+      "learning_rate": 2.809043415342784e-07,
+      "loss": 0.0004,
+      "num_tokens": 40361402.0,
+      "reward": 0.546875,
+      "reward_std": 0.0625,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 269
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.96,
+      "grad_norm": 1.8696840139920259,
+      "kl": 0.512451171875,
+      "learning_rate": 2.794737917378797e-07,
+      "loss": 0.0005,
+      "num_tokens": 40511494.0,
+      "reward": 0.515625,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 270
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9635555555555556,
+      "grad_norm": 0.5775124536896724,
+      "kl": 0.4365234375,
+      "learning_rate": 2.780422625833153e-07,
+      "loss": 0.0004,
+      "num_tokens": 40661614.0,
+      "reward": 0.53125,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 271
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9671111111111111,
+      "grad_norm": 0.5174571918726184,
+      "kl": 0.40380859375,
+      "learning_rate": 2.766098016375823e-07,
+      "loss": 0.0004,
+      "num_tokens": 40811670.0,
+      "reward": 0.53125,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 272
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9706666666666667,
+      "grad_norm": 10.856193948863917,
+      "kl": 2.42578125,
+      "learning_rate": 2.751764564986396e-07,
+      "loss": 0.0024,
+      "num_tokens": 40961810.0,
+      "reward": 0.53125,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 273
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9742222222222222,
+      "grad_norm": 0.5252541651736331,
+      "kl": 0.414306640625,
+      "learning_rate": 2.737422747938259e-07,
+      "loss": 0.0004,
+      "num_tokens": 41111894.0,
+      "reward": 0.54296875,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 274
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9777777777777777,
+      "grad_norm": 0.584952826070172,
+      "kl": 0.364501953125,
+      "learning_rate": 2.723073041782776e-07,
+      "loss": 0.0004,
+      "num_tokens": 41262010.0,
+      "reward": 0.55078125,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 275
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9813333333333333,
+      "grad_norm": 0.6583135007881525,
+      "kl": 0.427001953125,
+      "learning_rate": 2.708715923333451e-07,
+      "loss": 0.0004,
+      "num_tokens": 41412086.0,
+      "reward": 0.56640625,
+      "reward_std": 0.0546875,
+      "rewards/equation_reward_func/mean": 0.15625,
+      "rewards/equation_reward_func/std": 0.3645188808441162,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 276
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9848888888888889,
+      "grad_norm": 0.4967388969256047,
+      "kl": 0.44140625,
+      "learning_rate": 2.6943518696500835e-07,
+      "loss": 0.0004,
+      "num_tokens": 41562150.0,
+      "reward": 0.51953125,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 277
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9884444444444445,
+      "grad_norm": 0.5621455003149533,
+      "kl": 0.484619140625,
+      "learning_rate": 2.6799813580229174e-07,
+      "loss": 0.0005,
+      "num_tokens": 41712282.0,
+      "reward": 0.5625,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 278
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.992,
+      "grad_norm": 0.4746005261751424,
+      "kl": 0.3916015625,
+      "learning_rate": 2.6656048659567834e-07,
+      "loss": 0.0004,
+      "num_tokens": 41862370.0,
+      "reward": 0.5546875,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 279
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9955555555555555,
+      "grad_norm": 0.535734063401937,
+      "kl": 0.441162109375,
+      "learning_rate": 2.65122287115523e-07,
+      "loss": 0.0004,
+      "num_tokens": 42012482.0,
+      "reward": 0.55078125,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 280
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9991111111111111,
+      "grad_norm": 0.07429460937077817,
+      "kl": 0.3955078125,
+      "learning_rate": 2.63683585150465e-07,
+      "loss": 0.0004,
+      "num_tokens": 42162534.0,
+      "reward": 0.53125,
+      "reward_std": 0.0,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 281
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0035555555555555,
+      "grad_norm": 0.8470947965625311,
+      "kl": 0.468505859375,
+      "learning_rate": 2.622444285058404e-07,
+      "loss": 0.0005,
+      "num_tokens": 42312626.0,
+      "reward": 0.49609375,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 282
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.007111111111111,
+      "grad_norm": 0.7246290147592683,
+      "kl": 0.474853515625,
+      "learning_rate": 2.6080486500209347e-07,
+      "loss": 0.0005,
+      "num_tokens": 42462734.0,
+      "reward": 0.5234375,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 283
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0106666666666666,
+      "grad_norm": 0.7970433352687321,
+      "kl": 0.417724609375,
+      "learning_rate": 2.5936494247318733e-07,
+      "loss": 0.0004,
+      "num_tokens": 42612814.0,
+      "reward": 0.5,
+      "reward_std": 0.0625,
+      "rewards/equation_reward_func/mean": 0.0390625,
+      "rewards/equation_reward_func/std": 0.194504976272583,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 284
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0142222222222221,
+      "grad_norm": 0.5963898764489264,
+      "kl": 0.490478515625,
+      "learning_rate": 2.5792470876501517e-07,
+      "loss": 0.0005,
+      "num_tokens": 42762926.0,
+      "reward": 0.4921875,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.0078125,
+      "rewards/equation_reward_func/std": 0.0883883461356163,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 285
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 911.0,
+      "completions/mean_length": 1023.1171875,
+      "completions/mean_terminated_length": 911.0,
+      "completions/min_length": 911.0,
+      "completions/min_terminated_length": 911.0,
+      "epoch": 1.0177777777777777,
+      "grad_norm": 0.7142379802123127,
+      "kl": 0.412841796875,
+      "learning_rate": 2.5648421173380974e-07,
+      "loss": -0.0009,
+      "num_tokens": 42912905.0,
+      "reward": 0.51171875,
+      "reward_std": 0.057104695588350296,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 286
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0213333333333334,
+      "grad_norm": 0.6099036684005931,
+      "kl": 0.409912109375,
+      "learning_rate": 2.550434992445538e-07,
+      "loss": 0.0004,
+      "num_tokens": 43062965.0,
+      "reward": 0.5390625,
+      "reward_std": 0.053028859198093414,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.9453125,
+      "rewards/format_reward_func/std": 0.22826264798641205,
+      "step": 287
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.024888888888889,
+      "grad_norm": 0.4520069380770881,
+      "kl": 0.437255859375,
+      "learning_rate": 2.536026191693893e-07,
+      "loss": 0.0004,
+      "num_tokens": 43213025.0,
+      "reward": 0.5390625,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 288
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0284444444444445,
+      "grad_norm": 0.6319811742197129,
+      "kl": 0.369873046875,
+      "learning_rate": 2.521616193860266e-07,
+      "loss": 0.0004,
+      "num_tokens": 43363137.0,
+      "reward": 0.50390625,
+      "reward_std": 0.041479695588350296,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.9453125,
+      "rewards/format_reward_func/std": 0.22826264798641205,
+      "step": 289
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.032,
+      "grad_norm": 2.6591058310513374,
+      "kl": 0.842041015625,
+      "learning_rate": 2.507205477761539e-07,
+      "loss": 0.0008,
+      "num_tokens": 43513221.0,
+      "reward": 0.515625,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 290
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0355555555555556,
+      "grad_norm": 0.5420344680148037,
+      "kl": 0.37548828125,
+      "learning_rate": 2.4927945222384613e-07,
+      "loss": 0.0004,
+      "num_tokens": 43663345.0,
+      "reward": 0.52734375,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 291
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.039111111111111,
+      "grad_norm": 0.5473848935681827,
+      "kl": 0.51171875,
+      "learning_rate": 2.4783838061397334e-07,
+      "loss": 0.0005,
+      "num_tokens": 43813417.0,
+      "reward": 0.515625,
+      "reward_std": 0.0,
+      "rewards/equation_reward_func/mean": 0.03125,
+      "rewards/equation_reward_func/std": 0.1746762990951538,
+      "rewards/format_reward_func/mean": 1.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 292
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0426666666666666,
+      "grad_norm": 0.7008917634695417,
+      "kl": 0.634765625,
+      "learning_rate": 2.4639738083061073e-07,
+      "loss": 0.0006,
+      "num_tokens": 43963469.0,
+      "reward": 0.51171875,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 293
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0462222222222222,
+      "grad_norm": 0.3295873056459611,
+      "kl": 0.4091796875,
+      "learning_rate": 2.4495650075544613e-07,
+      "loss": 0.0004,
+      "num_tokens": 44113521.0,
+      "reward": 0.53125,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 294
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0497777777777777,
+      "grad_norm": 0.5414055832707978,
+      "kl": 0.39501953125,
+      "learning_rate": 2.435157882661903e-07,
+      "loss": 0.0004,
+      "num_tokens": 44263553.0,
+      "reward": 0.54296875,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 295
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0533333333333332,
+      "grad_norm": 0.4682537399126061,
+      "kl": 0.397705078125,
+      "learning_rate": 2.420752912349848e-07,
+      "loss": 0.0004,
+      "num_tokens": 44413705.0,
+      "reward": 0.53515625,
+      "reward_std": 0.045216359198093414,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 296
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.056888888888889,
+      "grad_norm": 0.922022274288094,
+      "kl": 0.557861328125,
+      "learning_rate": 2.4063505752681265e-07,
+      "loss": 0.0006,
+      "num_tokens": 44563845.0,
+      "reward": 0.46875,
+      "reward_std": 0.06491719186306,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 0.921875,
+      "rewards/format_reward_func/std": 0.2694226801395416,
+      "step": 297
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0604444444444445,
+      "grad_norm": 0.2921901386740173,
+      "kl": 0.396484375,
+      "learning_rate": 2.3919513499790646e-07,
+      "loss": 0.0004,
+      "num_tokens": 44713933.0,
+      "reward": 0.5625,
+      "reward_std": 0.018042195588350296,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 298
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.064,
+      "grad_norm": 0.45593720122261916,
+      "kl": 0.45068359375,
+      "learning_rate": 2.3775557149415953e-07,
+      "loss": 0.0005,
+      "num_tokens": 44863933.0,
+      "reward": 0.5625,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 299
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0675555555555556,
+      "grad_norm": 0.37302671430025486,
+      "kl": 0.4716796875,
+      "learning_rate": 2.3631641484953493e-07,
+      "loss": 0.0005,
+      "num_tokens": 45014049.0,
+      "reward": 0.5390625,
+      "reward_std": 0.009021097794175148,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 300
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0711111111111111,
+      "grad_norm": 0.39290892679689327,
+      "kl": 0.373046875,
+      "learning_rate": 2.3487771288447703e-07,
+      "loss": 0.0004,
+      "num_tokens": 45164101.0,
+      "reward": 0.53515625,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 301
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0746666666666667,
+      "grad_norm": 0.7056356953892552,
+      "kl": 0.4267578125,
+      "learning_rate": 2.3343951340432158e-07,
+      "loss": 0.0004,
+      "num_tokens": 45314165.0,
+      "reward": 0.51953125,
+      "reward_std": 0.06744526326656342,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 302
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0782222222222222,
+      "grad_norm": 0.5027562199597438,
+      "kl": 0.489501953125,
+      "learning_rate": 2.3200186419770823e-07,
+      "loss": 0.0005,
+      "num_tokens": 45464281.0,
+      "reward": 0.51953125,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 303
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0817777777777777,
+      "grad_norm": 0.2975548497811407,
+      "kl": 0.385498046875,
+      "learning_rate": 2.3056481303499163e-07,
+      "loss": 0.0004,
+      "num_tokens": 45614357.0,
+      "reward": 0.5390625,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 1.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 304
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0853333333333333,
+      "grad_norm": 0.43699072382589177,
+      "kl": 0.414306640625,
+      "learning_rate": 2.291284076666549e-07,
+      "loss": 0.0004,
+      "num_tokens": 45764381.0,
+      "reward": 0.5390625,
+      "reward_std": 0.024646097794175148,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 305
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0888888888888888,
+      "grad_norm": 0.7264220978249349,
+      "kl": 0.537109375,
+      "learning_rate": 2.2769269582172236e-07,
+      "loss": 0.0005,
+      "num_tokens": 45914405.0,
+      "reward": 0.52734375,
+      "reward_std": 0.0078125,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 306
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0924444444444443,
+      "grad_norm": 0.8246989232799073,
+      "kl": 0.415771484375,
+      "learning_rate": 2.262577252061741e-07,
+      "loss": 0.0004,
+      "num_tokens": 46064497.0,
+      "reward": 0.56640625,
+      "reward_std": 0.07933359593153,
+      "rewards/equation_reward_func/mean": 0.1875,
+      "rewards/equation_reward_func/std": 0.39184603095054626,
+      "rewards/format_reward_func/mean": 0.9453125,
+      "rewards/format_reward_func/std": 0.22826264798641205,
+      "step": 307
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.096,
+      "grad_norm": 1.5117707972901218,
+      "kl": 0.685791015625,
+      "learning_rate": 2.2482354350136043e-07,
+      "loss": 0.0007,
+      "num_tokens": 46214529.0,
+      "reward": 0.55078125,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 308
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0995555555555556,
+      "grad_norm": 0.4384748657076267,
+      "kl": 0.436279296875,
+      "learning_rate": 2.2339019836241768e-07,
+      "loss": 0.0004,
+      "num_tokens": 46364633.0,
+      "reward": 0.51953125,
+      "reward_std": 0.016833597794175148,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 309
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1031111111111112,
+      "grad_norm": 0.5367303888744025,
+      "kl": 0.478515625,
+      "learning_rate": 2.219577374166847e-07,
+      "loss": 0.0005,
+      "num_tokens": 46514697.0,
+      "reward": 0.5078125,
+      "reward_std": 0.03839729726314545,
+      "rewards/equation_reward_func/mean": 0.03125,
+      "rewards/equation_reward_func/std": 0.1746762990951538,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 310
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1066666666666667,
+      "grad_norm": 0.5858380888629433,
+      "kl": 0.43408203125,
+      "learning_rate": 2.2052620826212031e-07,
+      "loss": 0.0004,
+      "num_tokens": 46664821.0,
+      "reward": 0.51953125,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 311
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1102222222222222,
+      "grad_norm": 0.641916377352643,
+      "kl": 0.55615234375,
+      "learning_rate": 2.1909565846572158e-07,
+      "loss": 0.0006,
+      "num_tokens": 46814953.0,
+      "reward": 0.48828125,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 312
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1137777777777778,
+      "grad_norm": 0.33090250971908225,
+      "kl": 0.41796875,
+      "learning_rate": 2.1766613556194344e-07,
+      "loss": 0.0004,
+      "num_tokens": 46964997.0,
+      "reward": 0.578125,
+      "reward_std": 0.024646097794175148,
+      "rewards/equation_reward_func/mean": 0.1640625,
+      "rewards/equation_reward_func/std": 0.371787428855896,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 313
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1173333333333333,
+      "grad_norm": 15.61725314528695,
+      "kl": 3.2099609375,
+      "learning_rate": 2.1623768705111914e-07,
+      "loss": 0.0032,
+      "num_tokens": 47115081.0,
+      "reward": 0.51171875,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.0390625,
+      "rewards/equation_reward_func/std": 0.194504976272583,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 314
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1208888888888888,
+      "grad_norm": 0.7107097215021569,
+      "kl": 0.523681640625,
+      "learning_rate": 2.1481036039788185e-07,
+      "loss": 0.0005,
+      "num_tokens": 47265145.0,
+      "reward": 0.57421875,
+      "reward_std": 0.0546875,
+      "rewards/equation_reward_func/mean": 0.1796875,
+      "rewards/equation_reward_func/std": 0.3854354918003082,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 315
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1244444444444444,
+      "grad_norm": 0.5636256684100517,
+      "kl": 0.508544921875,
+      "learning_rate": 2.133842030295875e-07,
+      "loss": 0.0005,
+      "num_tokens": 47415337.0,
+      "reward": 0.51171875,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 316
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1280000000000001,
+      "grad_norm": 0.5116667035209906,
+      "kl": 0.435791015625,
+      "learning_rate": 2.1195926233473905e-07,
+      "loss": 0.0004,
+      "num_tokens": 47565405.0,
+      "reward": 0.55859375,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 317
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1315555555555556,
+      "grad_norm": 0.6779613408018799,
+      "kl": 0.4248046875,
+      "learning_rate": 2.105355856614115e-07,
+      "loss": 0.0004,
+      "num_tokens": 47715461.0,
+      "reward": 0.5546875,
+      "reward_std": 0.05589609593153,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 318
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1351111111111112,
+      "grad_norm": 0.3921549457589845,
+      "kl": 0.412353515625,
+      "learning_rate": 2.0911322031567907e-07,
+      "loss": 0.0004,
+      "num_tokens": 47865549.0,
+      "reward": 0.53125,
+      "reward_std": 0.024646097794175148,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 319
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1386666666666667,
+      "grad_norm": 0.4958321729986366,
+      "kl": 0.4521484375,
+      "learning_rate": 2.076922135600427e-07,
+      "loss": 0.0005,
+      "num_tokens": 48015629.0,
+      "reward": 0.5390625,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 320
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1422222222222222,
+      "grad_norm": 0.7423549988863264,
+      "kl": 0.406005859375,
+      "learning_rate": 2.0627261261186048e-07,
+      "loss": 0.0004,
+      "num_tokens": 48165705.0,
+      "reward": 0.55078125,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 321
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1457777777777778,
+      "grad_norm": 0.22087517892930575,
+      "kl": 0.41162109375,
+      "learning_rate": 2.0485446464177752e-07,
+      "loss": 0.0004,
+      "num_tokens": 48315833.0,
+      "reward": 0.54296875,
+      "reward_std": 0.0078125,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 1.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 322
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1493333333333333,
+      "grad_norm": 0.9688657800607915,
+      "kl": 0.743408203125,
+      "learning_rate": 2.034378167721599e-07,
+      "loss": 0.0007,
+      "num_tokens": 48465977.0,
+      "reward": 0.5234375,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 323
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1528888888888889,
+      "grad_norm": 0.5069073201298848,
+      "kl": 0.3955078125,
+      "learning_rate": 2.0202271607552766e-07,
+      "loss": 0.0004,
+      "num_tokens": 48616049.0,
+      "reward": 0.51953125,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 324
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1564444444444444,
+      "grad_norm": 0.30676846934679053,
+      "kl": 0.37060546875,
+      "learning_rate": 2.006092095729916e-07,
+      "loss": 0.0004,
+      "num_tokens": 48766117.0,
+      "reward": 0.52734375,
+      "reward_std": 0.0078125,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 325
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.16,
+      "grad_norm": 8.64784262784224,
+      "kl": 1.320556640625,
+      "learning_rate": 1.9919734423269018e-07,
+      "loss": 0.0013,
+      "num_tokens": 48916213.0,
+      "reward": 0.53125,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 326
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1635555555555555,
+      "grad_norm": 0.5177358301467603,
+      "kl": 0.4267578125,
+      "learning_rate": 1.9778716696822948e-07,
+      "loss": 0.0004,
+      "num_tokens": 49066345.0,
+      "reward": 0.546875,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 327
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1671111111111112,
+      "grad_norm": 0.29160214419010005,
+      "kl": 0.4140625,
+      "learning_rate": 1.9637872463712362e-07,
+      "loss": 0.0004,
+      "num_tokens": 49216417.0,
+      "reward": 0.53125,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 1.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 328
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1706666666666667,
+      "grad_norm": 0.37306669591193975,
+      "kl": 0.389404296875,
+      "learning_rate": 1.9497206403923864e-07,
+      "loss": 0.0004,
+      "num_tokens": 49366501.0,
+      "reward": 0.546875,
+      "reward_std": 0.024646097794175148,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 329
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1742222222222223,
+      "grad_norm": 0.3457412118045277,
+      "kl": 0.439697265625,
+      "learning_rate": 1.9356723191523646e-07,
+      "loss": 0.0004,
+      "num_tokens": 49516501.0,
+      "reward": 0.515625,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.0390625,
+      "rewards/equation_reward_func/std": 0.194504976272583,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 330
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1777777777777778,
+      "grad_norm": 0.49964396837602626,
+      "kl": 0.417236328125,
+      "learning_rate": 1.921642749450228e-07,
+      "loss": 0.0004,
+      "num_tokens": 49666537.0,
+      "reward": 0.609375,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.2265625,
+      "rewards/equation_reward_func/std": 0.4202519655227661,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 331
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1813333333333333,
+      "grad_norm": 0.4051353842023595,
+      "kl": 0.473388671875,
+      "learning_rate": 1.9076323974619512e-07,
+      "loss": 0.0005,
+      "num_tokens": 49816629.0,
+      "reward": 0.5703125,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 332
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1848888888888889,
+      "grad_norm": 5.378834991563153,
+      "kl": 1.024658203125,
+      "learning_rate": 1.8936417287249446e-07,
+      "loss": 0.001,
+      "num_tokens": 49966761.0,
+      "reward": 0.5390625,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 333
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1884444444444444,
+      "grad_norm": 0.6720029560481326,
+      "kl": 0.527099609375,
+      "learning_rate": 1.8796712081225774e-07,
+      "loss": 0.0005,
+      "num_tokens": 50116905.0,
+      "reward": 0.55859375,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 334
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 834.0,
+      "completions/mean_length": 1022.515625,
+      "completions/mean_terminated_length": 834.0,
+      "completions/min_length": 834.0,
+      "completions/min_terminated_length": 834.0,
+      "epoch": 1.192,
+      "grad_norm": 0.6578900739907899,
+      "kl": 0.492431640625,
+      "learning_rate": 1.8657212998687388e-07,
+      "loss": 0.0005,
+      "num_tokens": 50266775.0,
+      "reward": 0.56640625,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 335
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.1955555555555555,
+      "grad_norm": 0.5860458852325504,
+      "kl": 0.45849609375,
+      "learning_rate": 1.8517924674924046e-07,
+      "loss": 0.0005,
+      "num_tokens": 50416907.0,
+      "reward": 0.515625,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 336
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.199111111111111,
+      "grad_norm": 0.5764488924096699,
+      "kl": 0.3935546875,
+      "learning_rate": 1.8378851738222439e-07,
+      "loss": 0.0004,
+      "num_tokens": 50566947.0,
+      "reward": 0.5625,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 337
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.2026666666666666,
+      "grad_norm": 0.3101655260933977,
+      "kl": 0.466552734375,
+      "learning_rate": 1.82399988097123e-07,
+      "loss": 0.0005,
+      "num_tokens": 50717059.0,
+      "reward": 0.54296875,
+      "reward_std": 0.0078125,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 1.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 338
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.2062222222222223,
+      "grad_norm": 0.25806921540657135,
+      "kl": 0.427001953125,
+      "learning_rate": 1.8101370503212962e-07,
+      "loss": 0.0004,
+      "num_tokens": 50867127.0,
+      "reward": 0.54296875,
+      "reward_std": 0.0078125,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 339
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.2097777777777778,
+      "grad_norm": 0.38683952780832426,
+      "kl": 0.398193359375,
+      "learning_rate": 1.7962971425079946e-07,
+      "loss": 0.0004,
+      "num_tokens": 51017191.0,
+      "reward": 0.55859375,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 340
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.2133333333333334,
+      "grad_norm": 0.43762529890937657,
+      "kl": 0.40673828125,
+      "learning_rate": 1.7824806174051994e-07,
+      "loss": 0.0004,
+      "num_tokens": 51167303.0,
+      "reward": 0.55078125,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 341
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.216888888888889,
+      "grad_norm": 0.4947585618325522,
+      "kl": 0.500732421875,
+      "learning_rate": 1.7686879341098172e-07,
+      "loss": 0.0005,
+      "num_tokens": 51317335.0,
+      "reward": 0.5546875,
+      "reward_std": 0.033667195588350296,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 342
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.2204444444444444,
+      "grad_norm": 0.29648752247386784,
+      "kl": 0.454345703125,
+      "learning_rate": 1.7549195509265407e-07,
+      "loss": 0.0005,
+      "num_tokens": 51467455.0,
+      "reward": 0.52734375,
+      "reward_std": 0.0078125,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 343
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.224,
+      "grad_norm": 0.3522267627578455,
+      "kl": 0.41064453125,
+      "learning_rate": 1.7411759253526137e-07,
+      "loss": 0.0004,
+      "num_tokens": 51617551.0,
+      "reward": 0.5546875,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 1.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 344
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.2275555555555555,
+      "grad_norm": 3.809079002025788,
+      "kl": 1.059814453125,
+      "learning_rate": 1.7274575140626315e-07,
+      "loss": 0.0011,
+      "num_tokens": 51767667.0,
+      "reward": 0.55078125,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 345
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.231111111111111,
+      "grad_norm": 0.20335882404499606,
+      "kl": 0.433837890625,
+      "learning_rate": 1.713764772893368e-07,
+      "loss": 0.0004,
+      "num_tokens": 51917775.0,
+      "reward": 0.55078125,
+      "reward_std": 0.0078125,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 1.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 346
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.2346666666666666,
+      "grad_norm": 0.40351519211548675,
+      "kl": 0.44384765625,
+      "learning_rate": 1.7000981568286263e-07,
+      "loss": 0.0004,
+      "num_tokens": 52067895.0,
+      "reward": 0.546875,
+      "reward_std": 0.024646097794175148,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 347
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.2382222222222223,
+      "grad_norm": 0.5942030016843242,
+      "kl": 0.45703125,
+      "learning_rate": 1.6864581199841226e-07,
+      "loss": 0.0005,
+      "num_tokens": 52218003.0,
+      "reward": 0.51171875,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 348
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.2417777777777779,
+      "grad_norm": 0.529993570144883,
+      "kl": 0.404052734375,
+      "learning_rate": 1.6728451155923966e-07,
+      "loss": 0.0004,
+      "num_tokens": 52368071.0,
+      "reward": 0.51953125,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 349
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.2453333333333334,
+      "grad_norm": 0.8041631283434129,
+      "kl": 0.8837890625,
+      "learning_rate": 1.6592595959877493e-07,
+      "loss": 0.0009,
+      "num_tokens": 52518159.0,
+      "reward": 0.546875,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 350
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.248888888888889,
+      "grad_norm": 1.6872516484862146,
+      "kl": 0.507080078125,
+      "learning_rate": 1.6457020125912158e-07,
+      "loss": 0.0005,
+      "num_tokens": 52668287.0,
+      "reward": 0.51171875,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 351
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.2524444444444445,
+      "grad_norm": 0.6429164573265299,
+      "kl": 0.455810546875,
+      "learning_rate": 1.6321728158955633e-07,
+      "loss": 0.0005,
+      "num_tokens": 52818387.0,
+      "reward": 0.5703125,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.15625,
+      "rewards/equation_reward_func/std": 0.3645188808441162,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 352
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.256,
+      "grad_norm": 0.6468681508234729,
+      "kl": 0.43701171875,
+      "learning_rate": 1.6186724554503237e-07,
+      "loss": 0.0004,
+      "num_tokens": 52968427.0,
+      "reward": 0.51171875,
+      "reward_std": 0.055230896919965744,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.9453125,
+      "rewards/format_reward_func/std": 0.22826264798641205,
+      "step": 353
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.2595555555555555,
+      "grad_norm": 0.44088485589071047,
+      "kl": 0.397216796875,
+      "learning_rate": 1.6052013798468528e-07,
+      "loss": 0.0004,
+      "num_tokens": 53118535.0,
+      "reward": 0.51171875,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.03125,
+      "rewards/equation_reward_func/std": 0.1746762990951538,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 354
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.263111111111111,
+      "grad_norm": 0.3945491441045777,
+      "kl": 0.38134765625,
+      "learning_rate": 1.5917600367034302e-07,
+      "loss": 0.0004,
+      "num_tokens": 53268763.0,
+      "reward": 0.4921875,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 355
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.2666666666666666,
+      "grad_norm": 0.638915571522292,
+      "kl": 0.408447265625,
+      "learning_rate": 1.578348872650378e-07,
+      "loss": 0.0004,
+      "num_tokens": 53418919.0,
+      "reward": 0.48828125,
+      "reward_std": 0.0546875,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 356
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.2702222222222221,
+      "grad_norm": 0.5121027616204278,
+      "kl": 0.429931640625,
+      "learning_rate": 1.564968333315229e-07,
+      "loss": 0.0004,
+      "num_tokens": 53568999.0,
+      "reward": 0.55859375,
+      "reward_std": 0.04808359593153,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 357
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.2737777777777777,
+      "grad_norm": 0.5563613028742419,
+      "kl": 0.509521484375,
+      "learning_rate": 1.5516188633079107e-07,
+      "loss": 0.0005,
+      "num_tokens": 53719115.0,
+      "reward": 0.51953125,
+      "reward_std": 0.04147969186306,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.9453125,
+      "rewards/format_reward_func/std": 0.22826264798641205,
+      "step": 358
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.2773333333333334,
+      "grad_norm": 0.5962361306068441,
+      "kl": 0.387939453125,
+      "learning_rate": 1.5383009062059794e-07,
+      "loss": 0.0004,
+      "num_tokens": 53869339.0,
+      "reward": 0.52734375,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 359
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.280888888888889,
+      "grad_norm": 0.8520264003008496,
+      "kl": 0.6318359375,
+      "learning_rate": 1.525014904539873e-07,
+      "loss": 0.0006,
+      "num_tokens": 54019455.0,
+      "reward": 0.53125,
+      "reward_std": 0.0,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 1.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 360
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.2844444444444445,
+      "grad_norm": 0.4853809422457624,
+      "kl": 0.45458984375,
+      "learning_rate": 1.5117612997782158e-07,
+      "loss": 0.0005,
+      "num_tokens": 54169571.0,
+      "reward": 0.52734375,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 361
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.288,
+      "grad_norm": 0.3747330328198201,
+      "kl": 0.42333984375,
+      "learning_rate": 1.49854053231314e-07,
+      "loss": 0.0004,
+      "num_tokens": 54319603.0,
+      "reward": 0.546875,
+      "reward_std": 0.024646097794175148,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 1.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 362
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.2915555555555556,
+      "grad_norm": 0.5492601505442783,
+      "kl": 0.47265625,
+      "learning_rate": 1.4853530414456612e-07,
+      "loss": 0.0005,
+      "num_tokens": 54469607.0,
+      "reward": 0.52734375,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 363
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.295111111111111,
+      "grad_norm": 0.33780973836896694,
+      "kl": 0.455810546875,
+      "learning_rate": 1.4721992653710718e-07,
+      "loss": 0.0005,
+      "num_tokens": 54619703.0,
+      "reward": 0.52734375,
+      "reward_std": 0.0078125,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 364
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.2986666666666666,
+      "grad_norm": 0.5018740934387684,
+      "kl": 0.458251953125,
+      "learning_rate": 1.45907964116439e-07,
+      "loss": 0.0005,
+      "num_tokens": 54769755.0,
+      "reward": 0.5703125,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.171875,
+      "rewards/equation_reward_func/std": 0.3787541687488556,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 365
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3022222222222222,
+      "grad_norm": 0.48892083551158816,
+      "kl": 0.4052734375,
+      "learning_rate": 1.4459946047658305e-07,
+      "loss": 0.0004,
+      "num_tokens": 54919835.0,
+      "reward": 0.5234375,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 366
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3057777777777777,
+      "grad_norm": 0.35313257655187524,
+      "kl": 0.4345703125,
+      "learning_rate": 1.4329445909663194e-07,
+      "loss": 0.0004,
+      "num_tokens": 55069915.0,
+      "reward": 0.53125,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 367
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3093333333333335,
+      "grad_norm": 1.3640081478037656,
+      "kl": 0.619384765625,
+      "learning_rate": 1.4199300333930515e-07,
+      "loss": 0.0006,
+      "num_tokens": 55219987.0,
+      "reward": 0.49609375,
+      "reward_std": 0.04620979726314545,
+      "rewards/equation_reward_func/mean": 0.0234375,
+      "rewards/equation_reward_func/std": 0.15188287198543549,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 368
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3128888888888888,
+      "grad_norm": 0.39606833878919273,
+      "kl": 0.41650390625,
+      "learning_rate": 1.4069513644950744e-07,
+      "loss": 0.0004,
+      "num_tokens": 55370151.0,
+      "reward": 0.4921875,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 369
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 743.0,
+      "completions/mean_length": 1021.8046875,
+      "completions/mean_terminated_length": 743.0,
+      "completions/min_length": 743.0,
+      "completions/min_terminated_length": 743.0,
+      "epoch": 1.3164444444444445,
+      "grad_norm": 0.44470406838575705,
+      "kl": 0.40087890625,
+      "learning_rate": 1.394009015528927e-07,
+      "loss": 0.0004,
+      "num_tokens": 55520026.0,
+      "reward": 0.52734375,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 370
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.32,
+      "grad_norm": 0.3552220431069685,
+      "kl": 0.37841796875,
+      "learning_rate": 1.3811034165443036e-07,
+      "loss": 0.0004,
+      "num_tokens": 55670030.0,
+      "reward": 0.53515625,
+      "reward_std": 0.025854695588350296,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 371
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3235555555555556,
+      "grad_norm": 0.44156036308113156,
+      "kl": 0.417724609375,
+      "learning_rate": 1.3682349963697676e-07,
+      "loss": 0.0004,
+      "num_tokens": 55820110.0,
+      "reward": 0.48828125,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 372
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3271111111111111,
+      "grad_norm": 0.8049133934652881,
+      "kl": 0.50244140625,
+      "learning_rate": 1.3554041825985e-07,
+      "loss": 0.0005,
+      "num_tokens": 55970202.0,
+      "reward": 0.5,
+      "reward_std": 0.09616719186306,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.9453125,
+      "rewards/format_reward_func/std": 0.22826264798641205,
+      "step": 373
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3306666666666667,
+      "grad_norm": 0.4029732452394675,
+      "kl": 0.392333984375,
+      "learning_rate": 1.3426114015740915e-07,
+      "loss": 0.0004,
+      "num_tokens": 56120286.0,
+      "reward": 0.5703125,
+      "reward_std": 0.024646097794175148,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 374
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3342222222222222,
+      "grad_norm": 0.6289438946544794,
+      "kl": 0.447021484375,
+      "learning_rate": 1.3298570783763805e-07,
+      "loss": 0.0004,
+      "num_tokens": 56270414.0,
+      "reward": 0.57421875,
+      "reward_std": 0.0078125,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 1.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 375
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3377777777777777,
+      "grad_norm": 0.5979204566593044,
+      "kl": 0.445556640625,
+      "learning_rate": 1.31714163680732e-07,
+      "loss": 0.0004,
+      "num_tokens": 56420570.0,
+      "reward": 0.51171875,
+      "reward_std": 0.0546875,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 376
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3413333333333333,
+      "grad_norm": 0.5602574601449087,
+      "kl": 0.440185546875,
+      "learning_rate": 1.3044654993769044e-07,
+      "loss": 0.0004,
+      "num_tokens": 56570618.0,
+      "reward": 0.5859375,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.1953125,
+      "rewards/equation_reward_func/std": 0.3979988098144531,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 377
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3448888888888888,
+      "grad_norm": 0.584595173061934,
+      "kl": 0.387451171875,
+      "learning_rate": 1.2918290872891236e-07,
+      "loss": 0.0004,
+      "num_tokens": 56720726.0,
+      "reward": 0.55078125,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 378
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3484444444444446,
+      "grad_norm": 0.38195153284699324,
+      "kl": 0.42041015625,
+      "learning_rate": 1.2792328204279712e-07,
+      "loss": 0.0004,
+      "num_tokens": 56870738.0,
+      "reward": 0.5234375,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 379
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3519999999999999,
+      "grad_norm": 0.534164470884044,
+      "kl": 0.380126953125,
+      "learning_rate": 1.2666771173434892e-07,
+      "loss": 0.0004,
+      "num_tokens": 57020866.0,
+      "reward": 0.484375,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 380
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3555555555555556,
+      "grad_norm": 0.31327674285986346,
+      "kl": 0.404296875,
+      "learning_rate": 1.2541623952378655e-07,
+      "loss": 0.0004,
+      "num_tokens": 57170962.0,
+      "reward": 0.5,
+      "reward_std": 0.022772299125790596,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 381
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3591111111111112,
+      "grad_norm": 0.6333355146035587,
+      "kl": 0.42822265625,
+      "learning_rate": 1.2416890699515636e-07,
+      "loss": 0.0004,
+      "num_tokens": 57320938.0,
+      "reward": 0.5703125,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.1640625,
+      "rewards/equation_reward_func/std": 0.371787428855896,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 382
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3626666666666667,
+      "grad_norm": 0.8455919752181283,
+      "kl": 0.582275390625,
+      "learning_rate": 1.2292575559495143e-07,
+      "loss": 0.0006,
+      "num_tokens": 57470974.0,
+      "reward": 0.5,
+      "reward_std": 0.037403859198093414,
+      "rewards/equation_reward_func/mean": 0.0234375,
+      "rewards/equation_reward_func/std": 0.15188287198543549,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 383
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3662222222222222,
+      "grad_norm": 0.5563433958155662,
+      "kl": 0.4375,
+      "learning_rate": 1.216868266307333e-07,
+      "loss": 0.0004,
+      "num_tokens": 57621114.0,
+      "reward": 0.5234375,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 384
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3697777777777778,
+      "grad_norm": 0.42654455439415234,
+      "kl": 0.437744140625,
+      "learning_rate": 1.2045216126976054e-07,
+      "loss": 0.0004,
+      "num_tokens": 57771166.0,
+      "reward": 0.51953125,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 385
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3733333333333333,
+      "grad_norm": 0.574456938538095,
+      "kl": 0.464111328125,
+      "learning_rate": 1.1922180053761985e-07,
+      "loss": 0.0005,
+      "num_tokens": 57921230.0,
+      "reward": 0.5390625,
+      "reward_std": 0.049292195588350296,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 386
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3768888888888888,
+      "grad_norm": 0.7083015972044223,
+      "kl": 0.5546875,
+      "learning_rate": 1.1799578531686355e-07,
+      "loss": 0.0006,
+      "num_tokens": 58071282.0,
+      "reward": 0.5390625,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 387
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3804444444444444,
+      "grad_norm": 0.8083602759346432,
+      "kl": 0.543212890625,
+      "learning_rate": 1.1677415634565066e-07,
+      "loss": 0.0005,
+      "num_tokens": 58221330.0,
+      "reward": 0.5625,
+      "reward_std": 0.07206448912620544,
+      "rewards/equation_reward_func/mean": 0.171875,
+      "rewards/equation_reward_func/std": 0.3787541687488556,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 388
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.384,
+      "grad_norm": 0.8332269118489771,
+      "kl": 0.646240234375,
+      "learning_rate": 1.1555695421639369e-07,
+      "loss": 0.0006,
+      "num_tokens": 58371338.0,
+      "reward": 0.515625,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 389
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3875555555555557,
+      "grad_norm": 0.6373859717105615,
+      "kl": 0.530029296875,
+      "learning_rate": 1.1434421937440927e-07,
+      "loss": 0.0005,
+      "num_tokens": 58521394.0,
+      "reward": 0.51953125,
+      "reward_std": 0.04808359593153,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 390
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3911111111111112,
+      "grad_norm": 0.5222272117027472,
+      "kl": 0.438232421875,
+      "learning_rate": 1.1313599211657493e-07,
+      "loss": 0.0004,
+      "num_tokens": 58671582.0,
+      "reward": 0.50390625,
+      "reward_std": 0.03619525954127312,
+      "rewards/equation_reward_func/mean": 0.0390625,
+      "rewards/equation_reward_func/std": 0.194504976272583,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 391
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3946666666666667,
+      "grad_norm": 0.44409467166709055,
+      "kl": 0.444091796875,
+      "learning_rate": 1.1193231258998933e-07,
+      "loss": 0.0004,
+      "num_tokens": 58821646.0,
+      "reward": 0.56640625,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 392
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.3982222222222223,
+      "grad_norm": 0.5442250998097384,
+      "kl": 0.53125,
+      "learning_rate": 1.1073322079063913e-07,
+      "loss": 0.0005,
+      "num_tokens": 58971662.0,
+      "reward": 0.53515625,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 393
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.4017777777777778,
+      "grad_norm": 1.553408988777784,
+      "kl": 0.532958984375,
+      "learning_rate": 1.0953875656206896e-07,
+      "loss": 0.0005,
+      "num_tokens": 59121722.0,
+      "reward": 0.52734375,
+      "reward_std": 0.05710469186306,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 394
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.4053333333333333,
+      "grad_norm": 0.5740208836758302,
+      "kl": 0.438232421875,
+      "learning_rate": 1.083489595940586e-07,
+      "loss": 0.0004,
+      "num_tokens": 59271834.0,
+      "reward": 0.52734375,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 395
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.4088888888888889,
+      "grad_norm": 0.4151009063244617,
+      "kl": 0.42919921875,
+      "learning_rate": 1.0716386942130312e-07,
+      "loss": 0.0004,
+      "num_tokens": 59421910.0,
+      "reward": 0.5234375,
+      "reward_std": 0.024646097794175148,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 396
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.4124444444444444,
+      "grad_norm": 0.7311720397309218,
+      "kl": 0.43115234375,
+      "learning_rate": 1.0598352542210021e-07,
+      "loss": 0.0004,
+      "num_tokens": 59571914.0,
+      "reward": 0.53515625,
+      "reward_std": 0.05710469186306,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 397
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.416,
+      "grad_norm": 0.6992515497367471,
+      "kl": 0.637939453125,
+      "learning_rate": 1.0480796681704077e-07,
+      "loss": 0.0006,
+      "num_tokens": 59722026.0,
+      "reward": 0.53125,
+      "reward_std": 0.024646097794175148,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 398
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.4195555555555557,
+      "grad_norm": 0.5217670799751064,
+      "kl": 0.43408203125,
+      "learning_rate": 1.0363723266770649e-07,
+      "loss": 0.0004,
+      "num_tokens": 59872062.0,
+      "reward": 0.51171875,
+      "reward_std": 0.041479695588350296,
+      "rewards/equation_reward_func/mean": 0.0390625,
+      "rewards/equation_reward_func/std": 0.194504976272583,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 399
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.423111111111111,
+      "grad_norm": 6.850730801381474,
+      "kl": 1.605712890625,
+      "learning_rate": 1.0247136187537123e-07,
+      "loss": 0.0016,
+      "num_tokens": 60022170.0,
+      "reward": 0.5078125,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.03125,
+      "rewards/equation_reward_func/std": 0.1746762990951538,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 400
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.4266666666666667,
+      "grad_norm": 0.5426555800683942,
+      "kl": 0.46826171875,
+      "learning_rate": 1.0131039317970907e-07,
+      "loss": 0.0005,
+      "num_tokens": 60172198.0,
+      "reward": 0.58203125,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.1796875,
+      "rewards/equation_reward_func/std": 0.3854354918003082,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 401
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.4302222222222223,
+      "grad_norm": 0.4359928548595561,
+      "kl": 0.419921875,
+      "learning_rate": 1.0015436515750636e-07,
+      "loss": 0.0004,
+      "num_tokens": 60322274.0,
+      "reward": 0.5703125,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.15625,
+      "rewards/equation_reward_func/std": 0.3645188808441162,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 402
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.4337777777777778,
+      "grad_norm": 0.5019244396354806,
+      "kl": 0.3994140625,
+      "learning_rate": 9.900331622138063e-08,
+      "loss": 0.0004,
+      "num_tokens": 60472338.0,
+      "reward": 0.57421875,
+      "reward_std": 0.041479695588350296,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 1.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 403
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.4373333333333334,
+      "grad_norm": 0.6518642054974103,
+      "kl": 0.466552734375,
+      "learning_rate": 9.785728461850346e-08,
+      "loss": 0.0005,
+      "num_tokens": 60622438.0,
+      "reward": 0.5390625,
+      "reward_std": 0.049292195588350296,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 404
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.4408888888888889,
+      "grad_norm": 0.6183842691534489,
+      "kl": 0.401123046875,
+      "learning_rate": 9.671630842933027e-08,
+      "loss": 0.0004,
+      "num_tokens": 60772530.0,
+      "reward": 0.60546875,
+      "reward_std": 0.0546875,
+      "rewards/equation_reward_func/mean": 0.2421875,
+      "rewards/equation_reward_func/std": 0.4300905168056488,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 405
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.4444444444444444,
+      "grad_norm": 0.5614152227928939,
+      "kl": 0.42724609375,
+      "learning_rate": 9.558042556633439e-08,
+      "loss": 0.0004,
+      "num_tokens": 60922622.0,
+      "reward": 0.515625,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 406
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.448,
+      "grad_norm": 0.7515661137862419,
+      "kl": 0.38232421875,
+      "learning_rate": 9.44496737727479e-08,
+      "loss": 0.0004,
+      "num_tokens": 61072702.0,
+      "reward": 0.5390625,
+      "reward_std": 0.07525776326656342,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 407
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.4515555555555555,
+      "grad_norm": 0.6888661898947002,
+      "kl": 0.446044921875,
+      "learning_rate": 9.332409062130686e-08,
+      "loss": 0.0004,
+      "num_tokens": 61222818.0,
+      "reward": 0.5546875,
+      "reward_std": 0.06491719186306,
+      "rewards/equation_reward_func/mean": 0.15625,
+      "rewards/equation_reward_func/std": 0.3645188808441162,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 408
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.455111111111111,
+      "grad_norm": 5.768513710763025,
+      "kl": 1.973388671875,
+      "learning_rate": 9.220371351300352e-08,
+      "loss": 0.002,
+      "num_tokens": 61372914.0,
+      "reward": 0.59375,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.203125,
+      "rewards/equation_reward_func/std": 0.40390563011169434,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 409
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9921875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 800.0,
+      "completions/mean_length": 1022.25,
+      "completions/mean_terminated_length": 800.0,
+      "completions/min_length": 800.0,
+      "completions/min_terminated_length": 800.0,
+      "epoch": 1.4586666666666668,
+      "grad_norm": 0.39703873761876407,
+      "kl": 0.51416015625,
+      "learning_rate": 9.10885796758428e-08,
+      "loss": 0.0005,
+      "num_tokens": 61522750.0,
+      "reward": 0.5078125,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 1.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 410
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.462222222222222,
+      "grad_norm": 0.4105299696973317,
+      "kl": 0.400634765625,
+      "learning_rate": 8.997872616360603e-08,
+      "loss": 0.0004,
+      "num_tokens": 61672774.0,
+      "reward": 0.5078125,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.03125,
+      "rewards/equation_reward_func/std": 0.1746762990951538,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 411
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.4657777777777778,
+      "grad_norm": 0.814786116863996,
+      "kl": 0.501953125,
+      "learning_rate": 8.887418985461903e-08,
+      "loss": 0.0005,
+      "num_tokens": 61822806.0,
+      "reward": 0.484375,
+      "reward_std": 0.0625,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 412
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.4693333333333334,
+      "grad_norm": 0.6543891528432907,
+      "kl": 0.448974609375,
+      "learning_rate": 8.777500745052743e-08,
+      "loss": 0.0004,
+      "num_tokens": 61972982.0,
+      "reward": 0.484375,
+      "reward_std": 0.049292195588350296,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 413
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.472888888888889,
+      "grad_norm": 0.3103412060077848,
+      "kl": 0.407958984375,
+      "learning_rate": 8.668121547507634e-08,
+      "loss": 0.0004,
+      "num_tokens": 62123038.0,
+      "reward": 0.53125,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 414
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.4764444444444444,
+      "grad_norm": 0.5962802113040242,
+      "kl": 0.478515625,
+      "learning_rate": 8.559285027289753e-08,
+      "loss": 0.0005,
+      "num_tokens": 62273026.0,
+      "reward": 0.59375,
+      "reward_std": 0.05589609593153,
+      "rewards/equation_reward_func/mean": 0.2109375,
+      "rewards/equation_reward_func/std": 0.4095771610736847,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 415
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.48,
+      "grad_norm": 0.6264823134539922,
+      "kl": 0.448486328125,
+      "learning_rate": 8.450994800830111e-08,
+      "loss": 0.0004,
+      "num_tokens": 62423086.0,
+      "reward": 0.53125,
+      "reward_std": 0.05589609593153,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 416
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.4835555555555555,
+      "grad_norm": 0.8450656426538271,
+      "kl": 0.41796875,
+      "learning_rate": 8.343254466407435e-08,
+      "loss": 0.0004,
+      "num_tokens": 62573242.0,
+      "reward": 0.5234375,
+      "reward_std": 0.08427885919809341,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 417
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.487111111111111,
+      "grad_norm": 0.4851276707023854,
+      "kl": 0.388427734375,
+      "learning_rate": 8.236067604028562e-08,
+      "loss": 0.0004,
+      "num_tokens": 62723310.0,
+      "reward": 0.515625,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 418
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.4906666666666666,
+      "grad_norm": 0.529824828272533,
+      "kl": 0.39306640625,
+      "learning_rate": 8.129437775309533e-08,
+      "loss": 0.0004,
+      "num_tokens": 62873314.0,
+      "reward": 0.53125,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 419
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.4942222222222221,
+      "grad_norm": 0.7362472960586969,
+      "kl": 0.423583984375,
+      "learning_rate": 8.023368523357182e-08,
+      "loss": 0.0004,
+      "num_tokens": 63023462.0,
+      "reward": 0.51953125,
+      "reward_std": 0.06370859593153,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 420
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.4977777777777779,
+      "grad_norm": 0.7821251913385924,
+      "kl": 0.479736328125,
+      "learning_rate": 7.917863372651476e-08,
+      "loss": 0.0005,
+      "num_tokens": 63173582.0,
+      "reward": 0.51171875,
+      "reward_std": 0.0817507952451706,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.9375,
+      "rewards/format_reward_func/std": 0.24301259219646454,
+      "step": 421
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.5013333333333332,
+      "grad_norm": 0.29499660137723543,
+      "kl": 0.4189453125,
+      "learning_rate": 7.812925828928332e-08,
+      "loss": 0.0004,
+      "num_tokens": 63323694.0,
+      "reward": 0.52734375,
+      "reward_std": 0.0078125,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 422
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.504888888888889,
+      "grad_norm": 3.151628702237811,
+      "kl": 1.814453125,
+      "learning_rate": 7.708559379063204e-08,
+      "loss": 0.0018,
+      "num_tokens": 63473758.0,
+      "reward": 0.5390625,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 423
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.5084444444444445,
+      "grad_norm": 0.42289374767757515,
+      "kl": 0.399658203125,
+      "learning_rate": 7.604767490955138e-08,
+      "loss": 0.0004,
+      "num_tokens": 63623866.0,
+      "reward": 0.53515625,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 424
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.512,
+      "grad_norm": 0.810138453876871,
+      "kl": 0.49365234375,
+      "learning_rate": 7.501553613411626e-08,
+      "loss": 0.0005,
+      "num_tokens": 63774038.0,
+      "reward": 0.5234375,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 425
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.5155555555555555,
+      "grad_norm": 0.5491563744374505,
+      "kl": 0.431396484375,
+      "learning_rate": 7.398921176033928e-08,
+      "loss": 0.0004,
+      "num_tokens": 63924126.0,
+      "reward": 0.50390625,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.0234375,
+      "rewards/equation_reward_func/std": 0.15188287198543549,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 426
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.519111111111111,
+      "grad_norm": 0.7576014460271948,
+      "kl": 0.43359375,
+      "learning_rate": 7.296873589103184e-08,
+      "loss": 0.0004,
+      "num_tokens": 64074174.0,
+      "reward": 0.546875,
+      "reward_std": 0.06491719186306,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 427
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.5226666666666666,
+      "grad_norm": 0.5650904936953062,
+      "kl": 0.400634765625,
+      "learning_rate": 7.195414243467029e-08,
+      "loss": 0.0004,
+      "num_tokens": 64224258.0,
+      "reward": 0.51171875,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 428
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.5262222222222221,
+      "grad_norm": 1.7191187286350949,
+      "kl": 0.529541015625,
+      "learning_rate": 7.094546510426994e-08,
+      "loss": 0.0005,
+      "num_tokens": 64374342.0,
+      "reward": 0.55859375,
+      "reward_std": 0.07933359593153,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 429
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.529777777777778,
+      "grad_norm": 0.6477386570332615,
+      "kl": 0.448974609375,
+      "learning_rate": 6.994273741626405e-08,
+      "loss": 0.0004,
+      "num_tokens": 64524470.0,
+      "reward": 0.59375,
+      "reward_std": 0.0625,
+      "rewards/equation_reward_func/mean": 0.21875,
+      "rewards/equation_reward_func/std": 0.41502299904823303,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 430
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.5333333333333332,
+      "grad_norm": 0.49484841194714935,
+      "kl": 0.42822265625,
+      "learning_rate": 6.8945992689391e-08,
+      "loss": 0.0004,
+      "num_tokens": 64674642.0,
+      "reward": 0.57421875,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 1.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 431
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.536888888888889,
+      "grad_norm": 0.5641664834282943,
+      "kl": 0.511962890625,
+      "learning_rate": 6.795526404358628e-08,
+      "loss": 0.0005,
+      "num_tokens": 64824714.0,
+      "reward": 0.55859375,
+      "reward_std": 0.05050079524517059,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 432
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.5404444444444443,
+      "grad_norm": 0.7774048704942202,
+      "kl": 0.4755859375,
+      "learning_rate": 6.697058439888283e-08,
+      "loss": 0.0005,
+      "num_tokens": 64974826.0,
+      "reward": 0.52734375,
+      "reward_std": 0.08307026326656342,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 433
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.544,
+      "grad_norm": 0.5632886984383448,
+      "kl": 0.44482421875,
+      "learning_rate": 6.599198647431642e-08,
+      "loss": 0.0004,
+      "num_tokens": 65125010.0,
+      "reward": 0.5390625,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 434
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.5475555555555556,
+      "grad_norm": 0.6453682372916957,
+      "kl": 0.515869140625,
+      "learning_rate": 6.501950278683907e-08,
+      "loss": 0.0005,
+      "num_tokens": 65275094.0,
+      "reward": 0.54296875,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 435
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.551111111111111,
+      "grad_norm": 0.5003006401865986,
+      "kl": 0.449462890625,
+      "learning_rate": 6.405316565023805e-08,
+      "loss": 0.0004,
+      "num_tokens": 65425174.0,
+      "reward": 0.55078125,
+      "reward_std": 0.03619525954127312,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 436
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.5546666666666666,
+      "grad_norm": 0.468183524036864,
+      "kl": 0.427490234375,
+      "learning_rate": 6.309300717406274e-08,
+      "loss": 0.0004,
+      "num_tokens": 65575170.0,
+      "reward": 0.578125,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.171875,
+      "rewards/equation_reward_func/std": 0.3787541687488556,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 437
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.5582222222222222,
+      "grad_norm": 0.4013690840146761,
+      "kl": 0.392333984375,
+      "learning_rate": 6.213905926255697e-08,
+      "loss": 0.0004,
+      "num_tokens": 65725174.0,
+      "reward": 0.5625,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 438
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.561777777777778,
+      "grad_norm": 0.5052845789767202,
+      "kl": 0.395263671875,
+      "learning_rate": 6.119135361359965e-08,
+      "loss": 0.0004,
+      "num_tokens": 65875286.0,
+      "reward": 0.51953125,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 439
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.5653333333333332,
+      "grad_norm": 0.7242539146561551,
+      "kl": 0.483642578125,
+      "learning_rate": 6.024992171765089e-08,
+      "loss": 0.0005,
+      "num_tokens": 66025386.0,
+      "reward": 0.5859375,
+      "reward_std": 0.07866839319467545,
+      "rewards/equation_reward_func/mean": 0.1953125,
+      "rewards/equation_reward_func/std": 0.3979988098144531,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 440
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.568888888888889,
+      "grad_norm": 0.3684742193914073,
+      "kl": 0.396728515625,
+      "learning_rate": 5.9314794856705983e-08,
+      "loss": 0.0004,
+      "num_tokens": 66175466.0,
+      "reward": 0.546875,
+      "reward_std": 0.037403859198093414,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 441
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.5724444444444443,
+      "grad_norm": 0.49890493402893316,
+      "kl": 0.42822265625,
+      "learning_rate": 5.8386004103255975e-08,
+      "loss": 0.0004,
+      "num_tokens": 66325550.0,
+      "reward": 0.53515625,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 442
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.576,
+      "grad_norm": 0.4438983691416006,
+      "kl": 0.461181640625,
+      "learning_rate": 5.7463580319254853e-08,
+      "loss": 0.0005,
+      "num_tokens": 66475646.0,
+      "reward": 0.51171875,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.0390625,
+      "rewards/equation_reward_func/std": 0.194504976272583,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 443
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.5795555555555556,
+      "grad_norm": 0.48341809431444255,
+      "kl": 0.456298828125,
+      "learning_rate": 5.6547554155094626e-08,
+      "loss": 0.0005,
+      "num_tokens": 66625682.0,
+      "reward": 0.53515625,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 444
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.5831111111111111,
+      "grad_norm": 0.7056142221033674,
+      "kl": 0.470703125,
+      "learning_rate": 5.563795604858615e-08,
+      "loss": 0.0005,
+      "num_tokens": 66775798.0,
+      "reward": 0.52734375,
+      "reward_std": 0.06370859593153,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 445
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.5866666666666667,
+      "grad_norm": 6.779742188034057,
+      "kl": 0.804931640625,
+      "learning_rate": 5.473481622394849e-08,
+      "loss": 0.0008,
+      "num_tokens": 66925906.0,
+      "reward": 0.546875,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 446
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.5902222222222222,
+      "grad_norm": 1.40942249947706,
+      "kl": 0.533203125,
+      "learning_rate": 5.3838164690803935e-08,
+      "loss": 0.0005,
+      "num_tokens": 67075970.0,
+      "reward": 0.51171875,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 447
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.5937777777777777,
+      "grad_norm": 0.5709869106367856,
+      "kl": 0.514892578125,
+      "learning_rate": 5.294803124318145e-08,
+      "loss": 0.0005,
+      "num_tokens": 67226142.0,
+      "reward": 0.5390625,
+      "reward_std": 0.047418396919965744,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 448
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.5973333333333333,
+      "grad_norm": 0.7428039346413163,
+      "kl": 0.47802734375,
+      "learning_rate": 5.20644454585262e-08,
+      "loss": 0.0005,
+      "num_tokens": 67376314.0,
+      "reward": 0.51171875,
+      "reward_std": 0.06370859593153,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 449
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.600888888888889,
+      "grad_norm": 0.47757983506186286,
+      "kl": 0.44580078125,
+      "learning_rate": 5.1187436696716906e-08,
+      "loss": 0.0004,
+      "num_tokens": 67526314.0,
+      "reward": 0.52734375,
+      "reward_std": 0.030584799125790596,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 450
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6044444444444443,
+      "grad_norm": 0.6183437552202213,
+      "kl": 0.5078125,
+      "learning_rate": 5.0317034099090524e-08,
+      "loss": 0.0005,
+      "num_tokens": 67676354.0,
+      "reward": 0.52734375,
+      "reward_std": 0.04808359593153,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 451
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.608,
+      "grad_norm": 0.584260562502066,
+      "kl": 0.486083984375,
+      "learning_rate": 4.9453266587473423e-08,
+      "loss": 0.0005,
+      "num_tokens": 67826470.0,
+      "reward": 0.5546875,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 452
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6115555555555554,
+      "grad_norm": 0.536997119691752,
+      "kl": 0.43310546875,
+      "learning_rate": 4.859616286322094e-08,
+      "loss": 0.0004,
+      "num_tokens": 67976526.0,
+      "reward": 0.5546875,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 453
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6151111111111112,
+      "grad_norm": 0.4285196702276041,
+      "kl": 0.413818359375,
+      "learning_rate": 4.774575140626316e-08,
+      "loss": 0.0004,
+      "num_tokens": 68126606.0,
+      "reward": 0.55859375,
+      "reward_std": 0.025854695588350296,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 454
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6186666666666667,
+      "grad_norm": 0.6432882809805262,
+      "kl": 0.40478515625,
+      "learning_rate": 4.6902060474159036e-08,
+      "loss": 0.0004,
+      "num_tokens": 68276658.0,
+      "reward": 0.59375,
+      "reward_std": 0.05589609593153,
+      "rewards/equation_reward_func/mean": 0.1953125,
+      "rewards/equation_reward_func/std": 0.3979988098144531,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 455
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6222222222222222,
+      "grad_norm": 0.670105632966217,
+      "kl": 0.43212890625,
+      "learning_rate": 4.6065118101157016e-08,
+      "loss": 0.0004,
+      "num_tokens": 68426750.0,
+      "reward": 0.5078125,
+      "reward_std": 0.06204995512962341,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 456
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6257777777777778,
+      "grad_norm": 0.7554410323140208,
+      "kl": 0.504150390625,
+      "learning_rate": 4.5234952097263965e-08,
+      "loss": 0.0005,
+      "num_tokens": 68576790.0,
+      "reward": 0.5625,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 457
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6293333333333333,
+      "grad_norm": 0.5383642917248337,
+      "kl": 0.489013671875,
+      "learning_rate": 4.4411590047320617e-08,
+      "loss": 0.0005,
+      "num_tokens": 68726862.0,
+      "reward": 0.5625,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 458
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6328888888888888,
+      "grad_norm": 0.6764051889067699,
+      "kl": 0.48876953125,
+      "learning_rate": 4.359505931008553e-08,
+      "loss": 0.0005,
+      "num_tokens": 68877026.0,
+      "reward": 0.5546875,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 459
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6364444444444444,
+      "grad_norm": 0.608366482588541,
+      "kl": 0.46923828125,
+      "learning_rate": 4.278538701732534e-08,
+      "loss": 0.0005,
+      "num_tokens": 69027150.0,
+      "reward": 0.546875,
+      "reward_std": 0.049292195588350296,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 460
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6400000000000001,
+      "grad_norm": 0.6650025403400517,
+      "kl": 0.48193359375,
+      "learning_rate": 4.198260007291399e-08,
+      "loss": 0.0005,
+      "num_tokens": 69177238.0,
+      "reward": 0.5,
+      "reward_std": 0.0625,
+      "rewards/equation_reward_func/mean": 0.0390625,
+      "rewards/equation_reward_func/std": 0.194504976272583,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 461
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6435555555555554,
+      "grad_norm": 0.4721462056014319,
+      "kl": 0.51953125,
+      "learning_rate": 4.118672515193794e-08,
+      "loss": 0.0005,
+      "num_tokens": 69327298.0,
+      "reward": 0.5390625,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 462
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6471111111111112,
+      "grad_norm": 0.6621926942392276,
+      "kl": 0.495361328125,
+      "learning_rate": 4.039778869981064e-08,
+      "loss": 0.0005,
+      "num_tokens": 69477314.0,
+      "reward": 0.546875,
+      "reward_std": 0.05589609593153,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 463
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6506666666666665,
+      "grad_norm": 0.7308447153300471,
+      "kl": 0.541748046875,
+      "learning_rate": 3.961581693139307e-08,
+      "loss": 0.0005,
+      "num_tokens": 69627398.0,
+      "reward": 0.55859375,
+      "reward_std": 0.0078125,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 464
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6542222222222223,
+      "grad_norm": 0.847646129412192,
+      "kl": 0.5849609375,
+      "learning_rate": 3.884083583012318e-08,
+      "loss": 0.0006,
+      "num_tokens": 69777462.0,
+      "reward": 0.58203125,
+      "reward_std": 0.0703125,
+      "rewards/equation_reward_func/mean": 0.1875,
+      "rewards/equation_reward_func/std": 0.39184603095054626,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 465
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6577777777777778,
+      "grad_norm": 0.5867478692263943,
+      "kl": 0.453857421875,
+      "learning_rate": 3.807287114715216e-08,
+      "loss": 0.0005,
+      "num_tokens": 69927534.0,
+      "reward": 0.546875,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 466
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6613333333333333,
+      "grad_norm": 0.5569296548464157,
+      "kl": 0.53515625,
+      "learning_rate": 3.731194840048915e-08,
+      "loss": 0.0005,
+      "num_tokens": 70077530.0,
+      "reward": 0.58203125,
+      "reward_std": 0.04808359593153,
+      "rewards/equation_reward_func/mean": 0.171875,
+      "rewards/equation_reward_func/std": 0.3787541687488556,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 467
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6648888888888889,
+      "grad_norm": 0.47294248502288805,
+      "kl": 0.427490234375,
+      "learning_rate": 3.655809287415284e-08,
+      "loss": 0.0004,
+      "num_tokens": 70227594.0,
+      "reward": 0.5234375,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 468
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6684444444444444,
+      "grad_norm": 0.4574981255397005,
+      "kl": 0.4287109375,
+      "learning_rate": 3.581132961733191e-08,
+      "loss": 0.0004,
+      "num_tokens": 70377714.0,
+      "reward": 0.53515625,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 469
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6720000000000002,
+      "grad_norm": 0.40519803603997345,
+      "kl": 0.4169921875,
+      "learning_rate": 3.5071683443552045e-08,
+      "loss": 0.0004,
+      "num_tokens": 70527870.0,
+      "reward": 0.52734375,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 470
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6755555555555555,
+      "grad_norm": 0.4840050767135489,
+      "kl": 0.47607421875,
+      "learning_rate": 3.433917892985208e-08,
+      "loss": 0.0005,
+      "num_tokens": 70677974.0,
+      "reward": 0.4921875,
+      "reward_std": 0.028382759541273117,
+      "rewards/equation_reward_func/mean": 0.0078125,
+      "rewards/equation_reward_func/std": 0.0883883461356163,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 471
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6791111111111112,
+      "grad_norm": 0.5689276035825345,
+      "kl": 0.447509765625,
+      "learning_rate": 3.3613840415966764e-08,
+      "loss": 0.0004,
+      "num_tokens": 70828062.0,
+      "reward": 0.5078125,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.03125,
+      "rewards/equation_reward_func/std": 0.1746762990951538,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 472
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6826666666666665,
+      "grad_norm": 0.3906743686263679,
+      "kl": 0.39892578125,
+      "learning_rate": 3.2895692003518575e-08,
+      "loss": 0.0004,
+      "num_tokens": 70978234.0,
+      "reward": 0.48828125,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 473
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6862222222222223,
+      "grad_norm": 0.6319095516033488,
+      "kl": 0.4541015625,
+      "learning_rate": 3.218475755521621e-08,
+      "loss": 0.0005,
+      "num_tokens": 71128366.0,
+      "reward": 0.55859375,
+      "reward_std": 0.05710469186306,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 474
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6897777777777778,
+      "grad_norm": 0.6527920446086392,
+      "kl": 0.404541015625,
+      "learning_rate": 3.1481060694062365e-08,
+      "loss": 0.0004,
+      "num_tokens": 71278446.0,
+      "reward": 0.53515625,
+      "reward_std": 0.06370859593153,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 475
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.6933333333333334,
+      "grad_norm": 0.609657629006971,
+      "kl": 0.41162109375,
+      "learning_rate": 3.078462480256819e-08,
+      "loss": 0.0004,
+      "num_tokens": 71428610.0,
+      "reward": 0.53125,
+      "reward_std": 0.058313291519880295,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 476
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.696888888888889,
+      "grad_norm": 0.8910468177123286,
+      "kl": 0.513427734375,
+      "learning_rate": 3.0095473021976794e-08,
+      "loss": 0.0005,
+      "num_tokens": 71578734.0,
+      "reward": 0.5078125,
+      "reward_std": 0.07152109593153,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 477
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7004444444444444,
+      "grad_norm": 0.8425501635040524,
+      "kl": 0.51416015625,
+      "learning_rate": 2.9413628251493934e-08,
+      "loss": 0.0005,
+      "num_tokens": 71728878.0,
+      "reward": 0.55078125,
+      "reward_std": 0.09077189117670059,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 478
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.704,
+      "grad_norm": 0.6418930440563599,
+      "kl": 0.430908203125,
+      "learning_rate": 2.8739113147527417e-08,
+      "loss": 0.0004,
+      "num_tokens": 71878958.0,
+      "reward": 0.5703125,
+      "reward_std": 0.05589609593153,
+      "rewards/equation_reward_func/mean": 0.15625,
+      "rewards/equation_reward_func/std": 0.3645188808441162,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 479
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7075555555555555,
+      "grad_norm": 1.0250618122706754,
+      "kl": 0.630859375,
+      "learning_rate": 2.8071950122934036e-08,
+      "loss": 0.0006,
+      "num_tokens": 72029110.0,
+      "reward": 0.53515625,
+      "reward_std": 0.07085589319467545,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 480
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7111111111111112,
+      "grad_norm": 0.6070267863410984,
+      "kl": 0.4892578125,
+      "learning_rate": 2.7412161346275052e-08,
+      "loss": 0.0005,
+      "num_tokens": 72179178.0,
+      "reward": 0.53515625,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 481
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7146666666666666,
+      "grad_norm": 0.41436971147539053,
+      "kl": 0.473388671875,
+      "learning_rate": 2.675976874107935e-08,
+      "loss": 0.0005,
+      "num_tokens": 72329226.0,
+      "reward": 0.51171875,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.0390625,
+      "rewards/equation_reward_func/std": 0.194504976272583,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 482
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7182222222222223,
+      "grad_norm": 0.7994525127067181,
+      "kl": 0.623046875,
+      "learning_rate": 2.611479398511518e-08,
+      "loss": 0.0006,
+      "num_tokens": 72479326.0,
+      "reward": 0.51953125,
+      "reward_std": 0.0546875,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 483
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7217777777777776,
+      "grad_norm": 3.591555794980261,
+      "kl": 0.92333984375,
+      "learning_rate": 2.5477258509669614e-08,
+      "loss": 0.0009,
+      "num_tokens": 72629430.0,
+      "reward": 0.55859375,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 484
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7253333333333334,
+      "grad_norm": 0.5523457619977477,
+      "kl": 0.407958984375,
+      "learning_rate": 2.4847183498836714e-08,
+      "loss": 0.0004,
+      "num_tokens": 72779562.0,
+      "reward": 0.55078125,
+      "reward_std": 0.0546875,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 485
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.728888888888889,
+      "grad_norm": 0.5337513777324062,
+      "kl": 0.489013671875,
+      "learning_rate": 2.4224589888813263e-08,
+      "loss": 0.0005,
+      "num_tokens": 72929630.0,
+      "reward": 0.49609375,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 486
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7324444444444445,
+      "grad_norm": 3.336975111696118,
+      "kl": 0.9912109375,
+      "learning_rate": 2.3609498367203467e-08,
+      "loss": 0.001,
+      "num_tokens": 73079714.0,
+      "reward": 0.49609375,
+      "reward_std": 0.06744526326656342,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.9453125,
+      "rewards/format_reward_func/std": 0.22826264798641205,
+      "step": 487
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.736,
+      "grad_norm": 0.37339089559378696,
+      "kl": 0.4267578125,
+      "learning_rate": 2.300192937233128e-08,
+      "loss": 0.0004,
+      "num_tokens": 73229834.0,
+      "reward": 0.51953125,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 488
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7395555555555555,
+      "grad_norm": 0.5184832733671315,
+      "kl": 0.509033203125,
+      "learning_rate": 2.240190309256143e-08,
+      "loss": 0.0005,
+      "num_tokens": 73379898.0,
+      "reward": 0.5,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.0234375,
+      "rewards/equation_reward_func/std": 0.15188287198543549,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 489
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.743111111111111,
+      "grad_norm": 0.937777108795673,
+      "kl": 0.7412109375,
+      "learning_rate": 2.1809439465628382e-08,
+      "loss": 0.0007,
+      "num_tokens": 73530006.0,
+      "reward": 0.5625,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 490
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7466666666666666,
+      "grad_norm": 0.44792777397212485,
+      "kl": 0.432373046875,
+      "learning_rate": 2.122455817797428e-08,
+      "loss": 0.0004,
+      "num_tokens": 73680138.0,
+      "reward": 0.50390625,
+      "reward_std": 0.016833597794175148,
+      "rewards/equation_reward_func/mean": 0.015625,
+      "rewards/equation_reward_func/std": 0.12450689822435379,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 491
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7502222222222223,
+      "grad_norm": 1.8165365556826047,
+      "kl": 1.04248046875,
+      "learning_rate": 2.0647278664094188e-08,
+      "loss": 0.001,
+      "num_tokens": 73830170.0,
+      "reward": 0.53125,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 492
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7537777777777777,
+      "grad_norm": 0.5746953789475809,
+      "kl": 0.496337890625,
+      "learning_rate": 2.007762010589098e-08,
+      "loss": 0.0005,
+      "num_tokens": 73980250.0,
+      "reward": 0.515625,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.9375,
+      "rewards/format_reward_func/std": 0.24301259219646454,
+      "step": 493
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7573333333333334,
+      "grad_norm": 34.458999462219055,
+      "kl": 9.58984375,
+      "learning_rate": 1.9515601432037317e-08,
+      "loss": 0.0097,
+      "num_tokens": 74130306.0,
+      "reward": 0.51953125,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 494
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7608888888888887,
+      "grad_norm": 1.1445300736653963,
+      "kl": 0.626953125,
+      "learning_rate": 1.8961241317347333e-08,
+      "loss": 0.0006,
+      "num_tokens": 74280438.0,
+      "reward": 0.515625,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 495
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7644444444444445,
+      "grad_norm": 0.530745335656726,
+      "kl": 0.537109375,
+      "learning_rate": 1.8414558182155456e-08,
+      "loss": 0.0005,
+      "num_tokens": 74430474.0,
+      "reward": 0.50390625,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.03125,
+      "rewards/equation_reward_func/std": 0.1746762990951538,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 496
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.768,
+      "grad_norm": 4.325748345041621,
+      "kl": 0.811767578125,
+      "learning_rate": 1.787557019170488e-08,
+      "loss": 0.0008,
+      "num_tokens": 74580578.0,
+      "reward": 0.53515625,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 497
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7715555555555556,
+      "grad_norm": 0.6871408255583382,
+      "kl": 0.51953125,
+      "learning_rate": 1.734429525554365e-08,
+      "loss": 0.0005,
+      "num_tokens": 74730626.0,
+      "reward": 0.4921875,
+      "reward_std": 0.04929219186306,
+      "rewards/equation_reward_func/mean": 0.03125,
+      "rewards/equation_reward_func/std": 0.1746762990951538,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 498
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.775111111111111,
+      "grad_norm": 0.5276217136164556,
+      "kl": 0.505859375,
+      "learning_rate": 1.6820751026929674e-08,
+      "loss": 0.0005,
+      "num_tokens": 74880714.0,
+      "reward": 0.5,
+      "reward_std": 0.04642495512962341,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 499
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7786666666666666,
+      "grad_norm": 0.6364230686117014,
+      "kl": 0.4501953125,
+      "learning_rate": 1.6304954902244095e-08,
+      "loss": 0.0005,
+      "num_tokens": 75030838.0,
+      "reward": 0.52734375,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 500
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7822222222222224,
+      "grad_norm": 0.45223361174699134,
+      "kl": 0.44482421875,
+      "learning_rate": 1.5796924020413327e-08,
+      "loss": 0.0004,
+      "num_tokens": 75180934.0,
+      "reward": 0.578125,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.171875,
+      "rewards/equation_reward_func/std": 0.3787541687488556,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 501
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7857777777777777,
+      "grad_norm": 3.248583973869253,
+      "kl": 0.61669921875,
+      "learning_rate": 1.529667526233941e-08,
+      "loss": 0.0006,
+      "num_tokens": 75330966.0,
+      "reward": 0.57421875,
+      "reward_std": 0.016833597794175148,
+      "rewards/equation_reward_func/mean": 0.1640625,
+      "rewards/equation_reward_func/std": 0.371787428855896,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 502
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7893333333333334,
+      "grad_norm": 0.4374604603350181,
+      "kl": 0.434326171875,
+      "learning_rate": 1.4804225250339281e-08,
+      "loss": 0.0004,
+      "num_tokens": 75481026.0,
+      "reward": 0.50390625,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.0390625,
+      "rewards/equation_reward_func/std": 0.194504976272583,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 503
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7928888888888888,
+      "grad_norm": 3.0992385739650823,
+      "kl": 0.75537109375,
+      "learning_rate": 1.4319590347592254e-08,
+      "loss": 0.0008,
+      "num_tokens": 75631058.0,
+      "reward": 0.546875,
+      "reward_std": 0.06304339319467545,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 504
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.7964444444444445,
+      "grad_norm": 0.652594553880175,
+      "kl": 0.421142578125,
+      "learning_rate": 1.3842786657596446e-08,
+      "loss": 0.0004,
+      "num_tokens": 75781122.0,
+      "reward": 0.52734375,
+      "reward_std": 0.0546875,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 505
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8,
+      "grad_norm": 0.765818438315625,
+      "kl": 0.432373046875,
+      "learning_rate": 1.3373830023633597e-08,
+      "loss": 0.0004,
+      "num_tokens": 75931254.0,
+      "reward": 0.515625,
+      "reward_std": 0.08956328779459,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.9296875,
+      "rewards/format_reward_func/std": 0.2566775679588318,
+      "step": 506
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8035555555555556,
+      "grad_norm": 0.3411460860044164,
+      "kl": 0.439453125,
+      "learning_rate": 1.2912736028242777e-08,
+      "loss": 0.0004,
+      "num_tokens": 76081346.0,
+      "reward": 0.5390625,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 507
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8071111111111111,
+      "grad_norm": 0.7582340748491105,
+      "kl": 0.58740234375,
+      "learning_rate": 1.2459519992702311e-08,
+      "loss": 0.0006,
+      "num_tokens": 76231422.0,
+      "reward": 0.515625,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 508
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8106666666666666,
+      "grad_norm": 0.6931442469315353,
+      "kl": 0.43994140625,
+      "learning_rate": 1.2014196976521035e-08,
+      "loss": 0.0004,
+      "num_tokens": 76381442.0,
+      "reward": 0.578125,
+      "reward_std": 0.07206448912620544,
+      "rewards/equation_reward_func/mean": 0.1875,
+      "rewards/equation_reward_func/std": 0.39184603095054626,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 509
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8142222222222222,
+      "grad_norm": 0.6242469027810024,
+      "kl": 0.418701171875,
+      "learning_rate": 1.1576781776937634e-08,
+      "loss": 0.0004,
+      "num_tokens": 76531578.0,
+      "reward": 0.53125,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 510
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8177777777777777,
+      "grad_norm": 0.5319993004285838,
+      "kl": 0.42333984375,
+      "learning_rate": 1.1147288928429116e-08,
+      "loss": 0.0004,
+      "num_tokens": 76681742.0,
+      "reward": 0.53125,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.0859375,
+      "rewards/equation_reward_func/std": 0.2813730239868164,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 511
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8213333333333335,
+      "grad_norm": 4.892495144239733,
+      "kl": 1.937744140625,
+      "learning_rate": 1.0725732702227735e-08,
+      "loss": 0.0019,
+      "num_tokens": 76831858.0,
+      "reward": 0.515625,
+      "reward_std": 0.05589609593153,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 512
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8248888888888888,
+      "grad_norm": 1.0196354843303994,
+      "kl": 0.447998046875,
+      "learning_rate": 1.0312127105846947e-08,
+      "loss": 0.0004,
+      "num_tokens": 76981942.0,
+      "reward": 0.5546875,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.1328125,
+      "rewards/equation_reward_func/std": 0.3407054841518402,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 513
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8284444444444445,
+      "grad_norm": 0.7205299980402574,
+      "kl": 0.420654296875,
+      "learning_rate": 9.906485882615695e-09,
+      "loss": 0.0004,
+      "num_tokens": 77132030.0,
+      "reward": 0.51953125,
+      "reward_std": 0.057104695588350296,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 514
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8319999999999999,
+      "grad_norm": 4.0685262934583095,
+      "kl": 1.60107421875,
+      "learning_rate": 9.50882251122212e-09,
+      "loss": 0.0016,
+      "num_tokens": 77282094.0,
+      "reward": 0.52734375,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 515
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8355555555555556,
+      "grad_norm": 0.627281356606571,
+      "kl": 0.442626953125,
+      "learning_rate": 9.119150205265324e-09,
+      "loss": 0.0004,
+      "num_tokens": 77432230.0,
+      "reward": 0.51953125,
+      "reward_std": 0.0546875,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 516
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8391111111111111,
+      "grad_norm": 0.3858806846831556,
+      "kl": 0.45751953125,
+      "learning_rate": 8.737481912816592e-09,
+      "loss": 0.0005,
+      "num_tokens": 77582310.0,
+      "reward": 0.58984375,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.1953125,
+      "rewards/equation_reward_func/std": 0.3979988098144531,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 517
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8426666666666667,
+      "grad_norm": 0.23991137840603588,
+      "kl": 0.4140625,
+      "learning_rate": 8.363830315988945e-09,
+      "loss": 0.0004,
+      "num_tokens": 77732286.0,
+      "reward": 0.52734375,
+      "reward_std": 0.0078125,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 518
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8462222222222222,
+      "grad_norm": 0.4740890347365351,
+      "kl": 0.447998046875,
+      "learning_rate": 7.99820783051583e-09,
+      "loss": 0.0004,
+      "num_tokens": 77882334.0,
+      "reward": 0.56640625,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 519
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8497777777777777,
+      "grad_norm": 0.5700442254866067,
+      "kl": 0.484375,
+      "learning_rate": 7.640626605338624e-09,
+      "loss": 0.0005,
+      "num_tokens": 78032414.0,
+      "reward": 0.515625,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 520
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8533333333333335,
+      "grad_norm": 0.41681350867652445,
+      "kl": 0.423095703125,
+      "learning_rate": 7.291098522202776e-09,
+      "loss": 0.0004,
+      "num_tokens": 78182498.0,
+      "reward": 0.546875,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 521
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8568888888888888,
+      "grad_norm": 0.7337724383393545,
+      "kl": 0.47021484375,
+      "learning_rate": 6.949635195263259e-09,
+      "loss": 0.0005,
+      "num_tokens": 78332678.0,
+      "reward": 0.546875,
+      "reward_std": 0.058313291519880295,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 522
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8604444444444446,
+      "grad_norm": 0.4627777960335074,
+      "kl": 0.410888671875,
+      "learning_rate": 6.616247970698319e-09,
+      "loss": 0.0004,
+      "num_tokens": 78482726.0,
+      "reward": 0.58203125,
+      "reward_std": 0.04808359593153,
+      "rewards/equation_reward_func/mean": 0.1875,
+      "rewards/equation_reward_func/std": 0.39184603095054626,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 523
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8639999999999999,
+      "grad_norm": 0.9327871091407389,
+      "kl": 0.59765625,
+      "learning_rate": 6.290947926332835e-09,
+      "loss": 0.0006,
+      "num_tokens": 78632766.0,
+      "reward": 0.5234375,
+      "reward_std": 0.07152109593153,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.9453125,
+      "rewards/format_reward_func/std": 0.22826264798641205,
+      "step": 524
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8675555555555556,
+      "grad_norm": 0.5684611750891795,
+      "kl": 0.43798828125,
+      "learning_rate": 5.97374587126992e-09,
+      "loss": 0.0004,
+      "num_tokens": 78782810.0,
+      "reward": 0.5390625,
+      "reward_std": 0.05589609593153,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 525
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.871111111111111,
+      "grad_norm": 0.41112080183218475,
+      "kl": 0.45068359375,
+      "learning_rate": 5.664652345531845e-09,
+      "loss": 0.0005,
+      "num_tokens": 78932894.0,
+      "reward": 0.53515625,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 526
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8746666666666667,
+      "grad_norm": 0.6226854601113873,
+      "kl": 0.4345703125,
+      "learning_rate": 5.363677619709933e-09,
+      "loss": 0.0004,
+      "num_tokens": 79082994.0,
+      "reward": 0.54296875,
+      "reward_std": 0.0390625,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 527
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8782222222222222,
+      "grad_norm": 0.40926220755087916,
+      "kl": 0.460205078125,
+      "learning_rate": 5.070831694623135e-09,
+      "loss": 0.0005,
+      "num_tokens": 79233062.0,
+      "reward": 0.55078125,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 528
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8817777777777778,
+      "grad_norm": 0.3715644209620836,
+      "kl": 0.443359375,
+      "learning_rate": 4.786124300985822e-09,
+      "loss": 0.0004,
+      "num_tokens": 79383142.0,
+      "reward": 0.5859375,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.1796875,
+      "rewards/equation_reward_func/std": 0.3854354918003082,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 529
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8853333333333333,
+      "grad_norm": 0.41303999956467335,
+      "kl": 0.425048828125,
+      "learning_rate": 4.509564899084328e-09,
+      "loss": 0.0004,
+      "num_tokens": 79533302.0,
+      "reward": 0.51953125,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 530
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8888888888888888,
+      "grad_norm": 0.34711935286318774,
+      "kl": 0.502685546875,
+      "learning_rate": 4.241162678462806e-09,
+      "loss": 0.0005,
+      "num_tokens": 79683382.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 1.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 531
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8924444444444446,
+      "grad_norm": 1.749104726801777,
+      "kl": 0.57666015625,
+      "learning_rate": 3.9809265576176146e-09,
+      "loss": 0.0006,
+      "num_tokens": 79833494.0,
+      "reward": 0.5546875,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 1.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 532
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.896,
+      "grad_norm": 0.317239752869522,
+      "kl": 0.460693359375,
+      "learning_rate": 3.7288651837012745e-09,
+      "loss": 0.0005,
+      "num_tokens": 79983606.0,
+      "reward": 0.5234375,
+      "reward_std": 0.009021097794175148,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 533
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.8995555555555557,
+      "grad_norm": 0.5981545675620618,
+      "kl": 0.4501953125,
+      "learning_rate": 3.4849869322348126e-09,
+      "loss": 0.0005,
+      "num_tokens": 80133666.0,
+      "reward": 0.5234375,
+      "reward_std": 0.05589609593153,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 534
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.903111111111111,
+      "grad_norm": 1.8987553387497587,
+      "kl": 0.595703125,
+      "learning_rate": 3.249299906829761e-09,
+      "loss": 0.0006,
+      "num_tokens": 80283778.0,
+      "reward": 0.5234375,
+      "reward_std": 0.037403859198093414,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 535
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.9066666666666667,
+      "grad_norm": 1.3482962026459895,
+      "kl": 0.652587890625,
+      "learning_rate": 3.0218119389186502e-09,
+      "loss": 0.0007,
+      "num_tokens": 80433874.0,
+      "reward": 0.57421875,
+      "reward_std": 0.04620979726314545,
+      "rewards/equation_reward_func/mean": 0.171875,
+      "rewards/equation_reward_func/std": 0.3787541687488556,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 536
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.9102222222222223,
+      "grad_norm": 0.29399766813544437,
+      "kl": 0.393798828125,
+      "learning_rate": 2.8025305874949945e-09,
+      "loss": 0.0004,
+      "num_tokens": 80583958.0,
+      "reward": 0.58203125,
+      "reward_std": 0.016833597794175148,
+      "rewards/equation_reward_func/mean": 0.171875,
+      "rewards/equation_reward_func/std": 0.3787541687488556,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 537
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.9137777777777778,
+      "grad_norm": 0.5162389083983054,
+      "kl": 0.40966796875,
+      "learning_rate": 2.5914631388619103e-09,
+      "loss": 0.0004,
+      "num_tokens": 80734106.0,
+      "reward": 0.5390625,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 538
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.9173333333333333,
+      "grad_norm": 0.5828591533810914,
+      "kl": 0.423583984375,
+      "learning_rate": 2.388616606390198e-09,
+      "loss": 0.0004,
+      "num_tokens": 80884218.0,
+      "reward": 0.5234375,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.078125,
+      "rewards/equation_reward_func/std": 0.2694226801395416,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 539
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.9208888888888889,
+      "grad_norm": 0.5133961922979738,
+      "kl": 0.511962890625,
+      "learning_rate": 2.193997730285141e-09,
+      "loss": 0.0005,
+      "num_tokens": 81034370.0,
+      "reward": 0.5390625,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 540
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.9244444444444444,
+      "grad_norm": 0.6682775620951583,
+      "kl": 0.51123046875,
+      "learning_rate": 2.0076129773627103e-09,
+      "loss": 0.0005,
+      "num_tokens": 81184450.0,
+      "reward": 0.5,
+      "reward_std": 0.05589609593153,
+      "rewards/equation_reward_func/mean": 0.03125,
+      "rewards/equation_reward_func/std": 0.1746762990951538,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 541
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.928,
+      "grad_norm": 0.58701555865206,
+      "kl": 0.44287109375,
+      "learning_rate": 1.8294685408345167e-09,
+      "loss": 0.0004,
+      "num_tokens": 81334506.0,
+      "reward": 0.5625,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.140625,
+      "rewards/equation_reward_func/std": 0.3490002751350403,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 542
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.9315555555555557,
+      "grad_norm": 0.5226655218446554,
+      "kl": 0.406005859375,
+      "learning_rate": 1.6595703401020844e-09,
+      "loss": 0.0004,
+      "num_tokens": 81484590.0,
+      "reward": 0.55078125,
+      "reward_std": 0.05050079524517059,
+      "rewards/equation_reward_func/mean": 0.1171875,
+      "rewards/equation_reward_func/std": 0.322907418012619,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 543
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.935111111111111,
+      "grad_norm": 0.44785902127312094,
+      "kl": 0.45068359375,
+      "learning_rate": 1.497924020560204e-09,
+      "loss": 0.0005,
+      "num_tokens": 81634682.0,
+      "reward": 0.51171875,
+      "reward_std": 0.030584799125790596,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 544
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.9386666666666668,
+      "grad_norm": 1.2271943789655686,
+      "kl": 0.521728515625,
+      "learning_rate": 1.3445349534093598e-09,
+      "loss": 0.0005,
+      "num_tokens": 81784818.0,
+      "reward": 0.53515625,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 545
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.942222222222222,
+      "grad_norm": 0.34440149179608015,
+      "kl": 0.428955078125,
+      "learning_rate": 1.199408235477123e-09,
+      "loss": 0.0004,
+      "num_tokens": 81934862.0,
+      "reward": 0.5390625,
+      "reward_std": 0.024646097794175148,
+      "rewards/equation_reward_func/mean": 0.1015625,
+      "rewards/equation_reward_func/std": 0.3032590448856354,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 546
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.9457777777777778,
+      "grad_norm": 0.5299180419650383,
+      "kl": 0.398681640625,
+      "learning_rate": 1.0625486890488978e-09,
+      "loss": 0.0004,
+      "num_tokens": 82084958.0,
+      "reward": 0.5078125,
+      "reward_std": 0.04027109593153,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 547
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.9493333333333334,
+      "grad_norm": 0.7383076800700398,
+      "kl": 0.576171875,
+      "learning_rate": 9.339608617077165e-10,
+      "loss": 0.0006,
+      "num_tokens": 82234994.0,
+      "reward": 0.515625,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.0703125,
+      "rewards/equation_reward_func/std": 0.2566775679588318,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 548
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.952888888888889,
+      "grad_norm": 2.413667288202016,
+      "kl": 0.955322265625,
+      "learning_rate": 8.136490261830553e-10,
+      "loss": 0.001,
+      "num_tokens": 82385046.0,
+      "reward": 0.5078125,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.046875,
+      "rewards/equation_reward_func/std": 0.21220162510871887,
+      "rewards/format_reward_func/mean": 0.96875,
+      "rewards/format_reward_func/std": 0.1746762990951538,
+      "step": 549
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.9564444444444444,
+      "grad_norm": 0.621725268878221,
+      "kl": 0.462890625,
+      "learning_rate": 7.016171802088633e-10,
+      "loss": 0.0005,
+      "num_tokens": 82535218.0,
+      "reward": 0.5390625,
+      "reward_std": 0.049292195588350296,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 550
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.96,
+      "grad_norm": 0.23655919159399819,
+      "kl": 0.40673828125,
+      "learning_rate": 5.978690463908087e-10,
+      "loss": 0.0004,
+      "num_tokens": 82685354.0,
+      "reward": 0.55859375,
+      "reward_std": 0.0078125,
+      "rewards/equation_reward_func/mean": 0.125,
+      "rewards/equation_reward_func/std": 0.3320184051990509,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 551
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.9635555555555557,
+      "grad_norm": 0.47785457137586707,
+      "kl": 0.448974609375,
+      "learning_rate": 5.024080720824608e-10,
+      "loss": 0.0004,
+      "num_tokens": 82835570.0,
+      "reward": 0.51953125,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 552
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.967111111111111,
+      "grad_norm": 0.346206288127188,
+      "kl": 0.446533203125,
+      "learning_rate": 4.152374292708538e-10,
+      "loss": 0.0004,
+      "num_tokens": 82985646.0,
+      "reward": 0.51953125,
+      "reward_std": 0.025854695588350296,
+      "rewards/equation_reward_func/mean": 0.0390625,
+      "rewards/equation_reward_func/std": 0.194504976272583,
+      "rewards/format_reward_func/mean": 1.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 553
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.9706666666666668,
+      "grad_norm": 0.3832712574599466,
+      "kl": 0.40869140625,
+      "learning_rate": 3.363600144710155e-10,
+      "loss": 0.0004,
+      "num_tokens": 83135678.0,
+      "reward": 0.55078125,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 554
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.974222222222222,
+      "grad_norm": 0.24281220847577703,
+      "kl": 0.391357421875,
+      "learning_rate": 2.6577844862973877e-10,
+      "loss": 0.0004,
+      "num_tokens": 83285698.0,
+      "reward": 0.52734375,
+      "reward_std": 0.0078125,
+      "rewards/equation_reward_func/mean": 0.0625,
+      "rewards/equation_reward_func/std": 0.24301259219646454,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 555
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.9777777777777779,
+      "grad_norm": 0.9582617582460151,
+      "kl": 0.546142578125,
+      "learning_rate": 2.0349507703851243e-10,
+      "loss": 0.0005,
+      "num_tokens": 83435778.0,
+      "reward": 0.57421875,
+      "reward_std": 0.03245859593153,
+      "rewards/equation_reward_func/mean": 0.1640625,
+      "rewards/equation_reward_func/std": 0.371787428855896,
+      "rewards/format_reward_func/mean": 0.984375,
+      "rewards/format_reward_func/std": 0.12450689822435379,
+      "step": 556
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.9813333333333332,
+      "grad_norm": 0.35323783838354617,
+      "kl": 0.42431640625,
+      "learning_rate": 1.4951196925561127e-10,
+      "loss": 0.0004,
+      "num_tokens": 83585798.0,
+      "reward": 0.54296875,
+      "reward_std": 0.0234375,
+      "rewards/equation_reward_func/mean": 0.09375,
+      "rewards/equation_reward_func/std": 0.29262590408325195,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 557
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.984888888888889,
+      "grad_norm": 1.4092624453276854,
+      "kl": 0.759521484375,
+      "learning_rate": 1.0383091903720665e-10,
+      "loss": 0.0008,
+      "num_tokens": 83735938.0,
+      "reward": 0.5625,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.1484375,
+      "rewards/equation_reward_func/std": 0.356930136680603,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 558
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.9884444444444445,
+      "grad_norm": 0.7220556700564954,
+      "kl": 0.511474609375,
+      "learning_rate": 6.645344427794186e-11,
+      "loss": 0.0005,
+      "num_tokens": 83886042.0,
+      "reward": 0.53125,
+      "reward_std": 0.049292195588350296,
+      "rewards/equation_reward_func/mean": 0.109375,
+      "rewards/equation_reward_func/std": 0.31333550810813904,
+      "rewards/format_reward_func/mean": 0.953125,
+      "rewards/format_reward_func/std": 0.21220162510871887,
+      "step": 559
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.992,
+      "grad_norm": 0.3915357093577265,
+      "kl": 0.454833984375,
+      "learning_rate": 3.738078696036151e-11,
+      "loss": 0.0005,
+      "num_tokens": 84036218.0,
+      "reward": 0.5234375,
+      "reward_std": 0.015625,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.9921875,
+      "rewards/format_reward_func/std": 0.0883883461356163,
+      "step": 560
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.9955555555555555,
+      "grad_norm": 0.4708155060402308,
+      "kl": 0.412353515625,
+      "learning_rate": 1.6613913113694423e-11,
+      "loss": 0.0004,
+      "num_tokens": 84186286.0,
+      "reward": 0.578125,
+      "reward_std": 0.03125,
+      "rewards/equation_reward_func/mean": 0.1796875,
+      "rewards/equation_reward_func/std": 0.3854354918003082,
+      "rewards/format_reward_func/mean": 0.9765625,
+      "rewards/format_reward_func/std": 0.15188287198543549,
+      "step": 561
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.999111111111111,
+      "grad_norm": 0.7953265607446423,
+      "kl": 0.6337890625,
+      "learning_rate": 4.153512781768231e-12,
+      "loss": 0.0006,
+      "num_tokens": 84336314.0,
+      "reward": 0.5078125,
+      "reward_std": 0.046875,
+      "rewards/equation_reward_func/mean": 0.0546875,
+      "rewards/equation_reward_func/std": 0.22826264798641205,
+      "rewards/format_reward_func/mean": 0.9609375,
+      "rewards/format_reward_func/std": 0.194504976272583,
+      "step": 562
+    },
+    {
+      "epoch": 1.999111111111111,
+      "step": 562,
+      "total_flos": 0.0,
+      "train_loss": 0.010520504666011201,
+      "train_runtime": 15057.2865,
+      "train_samples_per_second": 1.195,
+      "train_steps_per_second": 0.037
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 562,
+  "num_input_tokens_seen": 84336314,
+  "num_train_epochs": 2,
+  "save_steps": 25,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}