diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,3394 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.56,
+  "eval_steps": 500,
+  "global_step": 140,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1261.0,
+      "completions/max_terminated_length": 1261.0,
+      "completions/mean_length": 411.5,
+      "completions/mean_terminated_length": 470.2857142857143,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 0.004,
+      "format_failures": 0.0,
+      "grad_norm": 0.3164481222629547,
+      "kl": 0.0,
+      "learning_rate": 0.0,
+      "loss": 0.0574,
+      "num_tokens": 20912.0,
+      "reward": 0.10000000149011612,
+      "reward_std": 0.19272480905056,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 125.0,
+      "completions/max_terminated_length": 125.0,
+      "completions/mean_length": 93.625,
+      "completions/mean_terminated_length": 107.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 30.0,
+      "epoch": 0.008,
+      "format_failures": 0.0,
+      "grad_norm": 3.300063133239746,
+      "kl": 0.0,
+      "learning_rate": 1e-06,
+      "loss": -0.032,
+      "num_tokens": 28472.0,
+      "reward": 0.5,
+      "reward_std": 0.5345224738121033,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 228.0,
+      "completions/max_terminated_length": 228.0,
+      "completions/mean_length": 170.375,
+      "completions/mean_terminated_length": 194.71428571428572,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 138.0,
+      "epoch": 0.012,
+      "format_failures": 0.0,
+      "grad_norm": 0.426563024520874,
+      "kl": 0.19075269997119904,
+      "learning_rate": 1e-06,
+      "loss": 0.0011,
+      "num_tokens": 38272.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 302.0,
+      "completions/max_terminated_length": 302.0,
+      "completions/mean_length": 215.75,
+      "completions/mean_terminated_length": 246.57142857142858,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 0.016,
+      "format_failures": 1.0,
+      "grad_norm": 0.3638526201248169,
+      "kl": 0.0030522841261699796,
+      "learning_rate": 1e-06,
+      "loss": 0.0265,
+      "num_tokens": 44880.0,
+      "reward": 0.17291666567325592,
+      "reward_std": 0.16665178537368774,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 129.0,
+      "completions/max_terminated_length": 129.0,
+      "completions/mean_length": 88.625,
+      "completions/mean_terminated_length": 101.28571428571429,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 66.0,
+      "epoch": 0.02,
+      "format_failures": 1.0,
+      "grad_norm": 12.54277515411377,
+      "kl": 1.5523776412010193,
+      "learning_rate": 1e-06,
+      "loss": 0.0192,
+      "num_tokens": 54104.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 388.0,
+      "completions/max_terminated_length": 388.0,
+      "completions/mean_length": 271.75,
+      "completions/mean_terminated_length": 310.57142857142856,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 0.024,
+      "format_failures": 0.0,
+      "grad_norm": 0.432476669549942,
+      "kl": 0.0021984531776979566,
+      "learning_rate": 1e-06,
+      "loss": -0.088,
+      "num_tokens": 66888.0,
+      "reward": 0.2569444477558136,
+      "reward_std": 0.27688348293304443,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 200.0,
+      "completions/max_terminated_length": 200.0,
+      "completions/mean_length": 82.625,
+      "completions/mean_terminated_length": 94.42857142857143,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 48.0,
+      "epoch": 0.028,
+      "format_failures": 0.0,
+      "grad_norm": 0.0007834668504074216,
+      "kl": 0.000487034791149199,
+      "learning_rate": 1e-06,
+      "loss": 0.0,
+      "num_tokens": 87976.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 182.0,
+      "completions/max_terminated_length": 182.0,
+      "completions/mean_length": 111.75,
+      "completions/mean_terminated_length": 127.71428571428571,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 94.0,
+      "epoch": 0.032,
+      "format_failures": 0.0,
+      "grad_norm": 0.2904910445213318,
+      "kl": 0.0784255713224411,
+      "learning_rate": 1e-06,
+      "loss": 0.0007,
+      "num_tokens": 97376.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 643.0,
+      "completions/max_terminated_length": 643.0,
+      "completions/mean_length": 297.25,
+      "completions/mean_terminated_length": 339.7142857142857,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 0.036,
+      "format_failures": 0.0,
+      "grad_norm": 0.5291862487792969,
+      "kl": 0.006049621384590864,
+      "learning_rate": 1e-06,
+      "loss": 0.046,
+      "num_tokens": 110264.0,
+      "reward": 0.2834821343421936,
+      "reward_std": 0.3961408734321594,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 62.0,
+      "completions/max_terminated_length": 62.0,
+      "completions/mean_length": 37.625,
+      "completions/mean_terminated_length": 50.166666666666664,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 42.0,
+      "epoch": 0.04,
+      "format_failures": 0.0,
+      "grad_norm": 1.7151610851287842,
+      "kl": 0.2360311597585678,
+      "learning_rate": 1e-06,
+      "loss": 0.004,
+      "num_tokens": 115504.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 154.0,
+      "completions/max_terminated_length": 154.0,
+      "completions/mean_length": 109.5,
+      "completions/mean_terminated_length": 125.14285714285714,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 62.0,
+      "epoch": 0.044,
+      "format_failures": 0.0,
+      "grad_norm": 1.5465294122695923,
+      "kl": 0.01262557739391923,
+      "learning_rate": 1e-06,
+      "loss": 0.1145,
+      "num_tokens": 125936.0,
+      "reward": 0.6499999761581421,
+      "reward_std": 0.4869731664657593,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 385.0,
+      "completions/max_terminated_length": 385.0,
+      "completions/mean_length": 278.625,
+      "completions/mean_terminated_length": 318.42857142857144,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 0.048,
+      "format_failures": 0.0,
+      "grad_norm": 0.49245283007621765,
+      "kl": 0.01833944395184517,
+      "learning_rate": 1e-06,
+      "loss": 0.0385,
+      "num_tokens": 134920.0,
+      "reward": 0.543181836605072,
+      "reward_std": 0.3499283194541931,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 69.0,
+      "completions/max_terminated_length": 69.0,
+      "completions/mean_length": 39.5,
+      "completions/mean_terminated_length": 45.142857142857146,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.052,
+      "format_failures": 0.0,
+      "grad_norm": 0.0073767416179180145,
+      "kl": 0.0018603539792820811,
+      "learning_rate": 1e-06,
+      "loss": 0.0,
+      "num_tokens": 155632.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1250.0,
+      "completions/max_terminated_length": 1250.0,
+      "completions/mean_length": 381.25,
+      "completions/mean_terminated_length": 435.7142857142857,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 148.0,
+      "epoch": 0.056,
+      "format_failures": 0.0,
+      "grad_norm": 0.4092012047767639,
+      "kl": 0.0037365095922723413,
+      "learning_rate": 1e-06,
+      "loss": 0.0532,
+      "num_tokens": 178856.0,
+      "reward": 0.08141025900840759,
+      "reward_std": 0.17304366827011108,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 480.0,
+      "completions/max_terminated_length": 480.0,
+      "completions/mean_length": 282.25,
+      "completions/mean_terminated_length": 322.57142857142856,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 155.0,
+      "epoch": 0.06,
+      "format_failures": 0.0,
+      "grad_norm": 0.4555729627609253,
+      "kl": 0.03388933837413788,
+      "learning_rate": 1e-06,
+      "loss": -0.0102,
+      "num_tokens": 189944.0,
+      "reward": 0.4369778633117676,
+      "reward_std": 0.3217828869819641,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 184.0,
+      "completions/max_terminated_length": 184.0,
+      "completions/mean_length": 87.625,
+      "completions/mean_terminated_length": 140.2,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 51.0,
+      "epoch": 0.064,
+      "format_failures": 0.0,
+      "grad_norm": 8.791972160339355,
+      "kl": 1.302387694362551,
+      "learning_rate": 1e-06,
+      "loss": 0.0127,
+      "num_tokens": 197912.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 321.0,
+      "completions/max_terminated_length": 321.0,
+      "completions/mean_length": 260.0,
+      "completions/mean_terminated_length": 297.14285714285717,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 0.068,
+      "format_failures": 0.0,
+      "grad_norm": 0.5435929298400879,
+      "kl": 0.016751494258642197,
+      "learning_rate": 1e-06,
+      "loss": 0.0027,
+      "num_tokens": 207184.0,
+      "reward": 0.0833333358168602,
+      "reward_std": 0.2357022762298584,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 69.0,
+      "completions/max_terminated_length": 69.0,
+      "completions/mean_length": 49.5,
+      "completions/mean_terminated_length": 56.57142857142857,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.072,
+      "format_failures": 0.0,
+      "grad_norm": 0.19489726424217224,
+      "kl": 0.061227064579725266,
+      "learning_rate": 1e-06,
+      "loss": 0.0006,
+      "num_tokens": 211464.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 354.0,
+      "completions/max_terminated_length": 354.0,
+      "completions/mean_length": 126.375,
+      "completions/mean_terminated_length": 144.42857142857142,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.076,
+      "format_failures": 0.0,
+      "grad_norm": 0.6644909381866455,
+      "kl": 0.010538576170802116,
+      "learning_rate": 1e-06,
+      "loss": -0.0572,
+      "num_tokens": 228504.0,
+      "reward": 0.125,
+      "reward_std": 0.3535533845424652,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 92.0,
+      "completions/max_terminated_length": 92.0,
+      "completions/mean_length": 66.375,
+      "completions/mean_terminated_length": 75.85714285714286,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 58.0,
+      "epoch": 0.08,
+      "format_failures": 0.0,
+      "grad_norm": 1.534692645072937,
+      "kl": 0.03320205491036177,
+      "learning_rate": 1e-06,
+      "loss": -0.0001,
+      "num_tokens": 236336.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 155.0,
+      "completions/max_terminated_length": 155.0,
+      "completions/mean_length": 95.75,
+      "completions/mean_terminated_length": 109.42857142857143,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 69.0,
+      "epoch": 0.084,
+      "format_failures": 0.0,
+      "grad_norm": 1.6011440753936768,
+      "kl": 0.028395552188158035,
+      "learning_rate": 1e-06,
+      "loss": 0.0329,
+      "num_tokens": 243416.0,
+      "reward": 0.28125,
+      "reward_std": 0.45193037390708923,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 292.0,
+      "completions/max_terminated_length": 292.0,
+      "completions/mean_length": 157.0,
+      "completions/mean_terminated_length": 179.42857142857142,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 74.0,
+      "epoch": 0.088,
+      "format_failures": 0.0,
+      "grad_norm": 0.8286527991294861,
+      "kl": 0.05863172188401222,
+      "learning_rate": 1e-06,
+      "loss": -0.0402,
+      "num_tokens": 251376.0,
+      "reward": 0.1830357164144516,
+      "reward_std": 0.28149792551994324,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 479.0,
+      "completions/max_terminated_length": 479.0,
+      "completions/mean_length": 293.625,
+      "completions/mean_terminated_length": 335.57142857142856,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 233.0,
+      "epoch": 0.092,
+      "format_failures": 0.0,
+      "grad_norm": 0.3510456383228302,
+      "kl": 0.04268372617661953,
+      "learning_rate": 1e-06,
+      "loss": -0.0068,
+      "num_tokens": 262824.0,
+      "reward": 0.29113247990608215,
+      "reward_std": 0.2665640711784363,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 403.0,
+      "completions/max_terminated_length": 403.0,
+      "completions/mean_length": 276.25,
+      "completions/mean_terminated_length": 315.7142857142857,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 66.0,
+      "epoch": 0.096,
+      "format_failures": 0.0,
+      "grad_norm": 0.6299352645874023,
+      "kl": 0.09684642031788826,
+      "learning_rate": 1e-06,
+      "loss": 0.0879,
+      "num_tokens": 273192.0,
+      "reward": 0.45770204067230225,
+      "reward_std": 0.4340135455131531,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 170.0,
+      "completions/max_terminated_length": 170.0,
+      "completions/mean_length": 123.125,
+      "completions/mean_terminated_length": 140.71428571428572,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 119.0,
+      "epoch": 0.1,
+      "format_failures": 0.0,
+      "grad_norm": 1.0716724395751953,
+      "kl": 0.08026151731610298,
+      "learning_rate": 1e-06,
+      "loss": 0.0274,
+      "num_tokens": 279688.0,
+      "reward": 0.5770493149757385,
+      "reward_std": 0.2756548523902893,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 395.0,
+      "completions/max_terminated_length": 395.0,
+      "completions/mean_length": 241.0,
+      "completions/mean_terminated_length": 275.42857142857144,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 0.104,
+      "format_failures": 0.0,
+      "grad_norm": 0.46685901284217834,
+      "kl": 0.06300827860832214,
+      "learning_rate": 1e-06,
+      "loss": -0.0169,
+      "num_tokens": 288160.0,
+      "reward": 0.4475490152835846,
+      "reward_std": 0.30980169773101807,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 147.0,
+      "completions/max_terminated_length": 147.0,
+      "completions/mean_length": 79.75,
+      "completions/mean_terminated_length": 91.14285714285714,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 52.0,
+      "epoch": 0.108,
+      "format_failures": 0.0,
+      "grad_norm": 1.3687446117401123,
+      "kl": 0.04298516921699047,
+      "learning_rate": 1e-06,
+      "loss": -0.0143,
+      "num_tokens": 293328.0,
+      "reward": 0.2083333432674408,
+      "reward_std": 0.39591163396835327,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 184.0,
+      "completions/max_terminated_length": 184.0,
+      "completions/mean_length": 93.75,
+      "completions/mean_terminated_length": 107.14285714285714,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 70.0,
+      "epoch": 0.112,
+      "format_failures": 0.0,
+      "grad_norm": 2.618457794189453,
+      "kl": 0.7109708972275257,
+      "learning_rate": 1e-06,
+      "loss": 0.008,
+      "num_tokens": 301896.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1037.0,
+      "completions/max_terminated_length": 1037.0,
+      "completions/mean_length": 347.0,
+      "completions/mean_terminated_length": 396.57142857142856,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 164.0,
+      "epoch": 0.116,
+      "format_failures": 0.0,
+      "grad_norm": 0.8099527955055237,
+      "kl": 0.004772833781316876,
+      "learning_rate": 1e-06,
+      "loss": 0.1672,
+      "num_tokens": 323048.0,
+      "reward": 0.3559523820877075,
+      "reward_std": 0.38564079999923706,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 178.0,
+      "completions/max_terminated_length": 178.0,
+      "completions/mean_length": 144.5,
+      "completions/mean_terminated_length": 165.14285714285714,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 129.0,
+      "epoch": 0.12,
+      "format_failures": 0.0,
+      "grad_norm": 0.3579946756362915,
+      "kl": 0.02128867618739605,
+      "learning_rate": 1e-06,
+      "loss": -0.0092,
+      "num_tokens": 329880.0,
+      "reward": 0.125,
+      "reward_std": 0.3535533845424652,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 300.0,
+      "completions/max_terminated_length": 300.0,
+      "completions/mean_length": 211.375,
+      "completions/mean_terminated_length": 241.57142857142858,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 0.124,
+      "format_failures": 0.0,
+      "grad_norm": 1.7018321752548218,
+      "kl": 0.014225509017705917,
+      "learning_rate": 1e-06,
+      "loss": -0.7754,
+      "num_tokens": 352120.0,
+      "reward": 0.2201923131942749,
+      "reward_std": 0.4099963307380676,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1071.0,
+      "completions/max_terminated_length": 1071.0,
+      "completions/mean_length": 291.375,
+      "completions/mean_terminated_length": 333.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 113.0,
+      "epoch": 0.128,
+      "format_failures": 0.0,
+      "grad_norm": 0.8998605608940125,
+      "kl": 0.0065889437682926655,
+      "learning_rate": 1e-06,
+      "loss": 0.4427,
+      "num_tokens": 373128.0,
+      "reward": 0.3992924690246582,
+      "reward_std": 0.34711551666259766,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 62.0,
+      "completions/max_terminated_length": 62.0,
+      "completions/mean_length": 47.125,
+      "completions/mean_terminated_length": 53.857142857142854,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 45.0,
+      "epoch": 0.132,
+      "format_failures": 0.0,
+      "grad_norm": 0.3835119307041168,
+      "kl": 0.1347430720925331,
+      "learning_rate": 1e-06,
+      "loss": 0.002,
+      "num_tokens": 379176.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2049.0,
+      "completions/max_terminated_length": 2049.0,
+      "completions/mean_length": 523.25,
+      "completions/mean_terminated_length": 598.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 0.136,
+      "format_failures": 0.0,
+      "grad_norm": 0.5971964597702026,
+      "kl": 0.013881782768294215,
+      "learning_rate": 1e-06,
+      "loss": 0.3237,
+      "num_tokens": 400040.0,
+      "reward": 0.36666667461395264,
+      "reward_std": 0.4086368978023529,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 134.0,
+      "completions/max_terminated_length": 134.0,
+      "completions/mean_length": 70.0,
+      "completions/mean_terminated_length": 80.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 40.0,
+      "epoch": 0.14,
+      "format_failures": 0.0,
+      "grad_norm": 1.4775596857070923,
+      "kl": 0.1210218146443367,
+      "learning_rate": 1e-06,
+      "loss": 0.0368,
+      "num_tokens": 404552.0,
+      "reward": 0.4513888955116272,
+      "reward_std": 0.4428916275501251,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 256.0,
+      "completions/mean_length": 112.75,
+      "completions/mean_terminated_length": 128.85714285714286,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 59.0,
+      "epoch": 0.144,
+      "format_failures": 0.0,
+      "grad_norm": 0.09341330826282501,
+      "kl": 0.08837828040122986,
+      "learning_rate": 1e-06,
+      "loss": 0.0006,
+      "num_tokens": 411528.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 259.0,
+      "completions/max_terminated_length": 259.0,
+      "completions/mean_length": 193.375,
+      "completions/mean_terminated_length": 221.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 0.148,
+      "format_failures": 0.0,
+      "grad_norm": 0.7524359822273254,
+      "kl": 0.04169362783432007,
+      "learning_rate": 1e-06,
+      "loss": 0.0556,
+      "num_tokens": 419160.0,
+      "reward": 0.5170454382896423,
+      "reward_std": 0.3414821922779083,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 64.0,
+      "completions/max_terminated_length": 64.0,
+      "completions/mean_length": 53.375,
+      "completions/mean_terminated_length": 61.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 56.0,
+      "epoch": 0.152,
+      "format_failures": 0.0,
+      "grad_norm": 0.3160454034805298,
+      "kl": 0.07513360120356083,
+      "learning_rate": 1e-06,
+      "loss": 0.0008,
+      "num_tokens": 423768.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 485.0,
+      "completions/max_terminated_length": 485.0,
+      "completions/mean_length": 301.125,
+      "completions/mean_terminated_length": 344.14285714285717,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 0.156,
+      "format_failures": 0.0,
+      "grad_norm": 1.2346574068069458,
+      "kl": 0.27855822443962097,
+      "learning_rate": 1e-06,
+      "loss": -0.0911,
+      "num_tokens": 435552.0,
+      "reward": 0.5062500238418579,
+      "reward_std": 0.43001821637153625,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 983.0,
+      "completions/max_terminated_length": 983.0,
+      "completions/mean_length": 453.0,
+      "completions/mean_terminated_length": 517.7142857142857,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 296.0,
+      "epoch": 0.16,
+      "format_failures": 0.0,
+      "grad_norm": 0.14271600544452667,
+      "kl": 0.011493591591715813,
+      "learning_rate": 1e-06,
+      "loss": -0.0636,
+      "num_tokens": 460960.0,
+      "reward": 0.5462301969528198,
+      "reward_std": 0.12065710872411728,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 106.0,
+      "completions/max_terminated_length": 106.0,
+      "completions/mean_length": 61.75,
+      "completions/mean_terminated_length": 70.57142857142857,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 30.0,
+      "epoch": 0.164,
+      "format_failures": 0.0,
+      "grad_norm": 3.143950939178467,
+      "kl": 0.05912626534700394,
+      "learning_rate": 1e-06,
+      "loss": -0.0781,
+      "num_tokens": 469456.0,
+      "reward": 0.32499998807907104,
+      "reward_std": 0.46521884202957153,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 218.0,
+      "completions/max_terminated_length": 218.0,
+      "completions/mean_length": 81.5,
+      "completions/mean_terminated_length": 93.14285714285714,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 61.0,
+      "epoch": 0.168,
+      "format_failures": 0.0,
+      "grad_norm": 0.4773140251636505,
+      "kl": 0.2549777179956436,
+      "learning_rate": 1e-06,
+      "loss": 0.0019,
+      "num_tokens": 475288.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 172.0,
+      "completions/max_terminated_length": 172.0,
+      "completions/mean_length": 111.375,
+      "completions/mean_terminated_length": 127.28571428571429,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 42.0,
+      "epoch": 0.172,
+      "format_failures": 0.0,
+      "grad_norm": 3.081820487976074,
+      "kl": 0.05859908275306225,
+      "learning_rate": 1e-06,
+      "loss": -0.132,
+      "num_tokens": 482848.0,
+      "reward": 0.375,
+      "reward_std": 0.5175491571426392,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 53.0,
+      "completions/max_terminated_length": 53.0,
+      "completions/mean_length": 22.5,
+      "completions/mean_terminated_length": 45.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.176,
+      "format_failures": 1.0,
+      "grad_norm": 0.5130624175071716,
+      "kl": 0.034313835203647614,
+      "learning_rate": 1e-06,
+      "loss": 0.001,
+      "num_tokens": 488336.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 145.0,
+      "completions/max_terminated_length": 145.0,
+      "completions/mean_length": 80.125,
+      "completions/mean_terminated_length": 91.57142857142857,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 68.0,
+      "epoch": 0.18,
+      "format_failures": 0.0,
+      "grad_norm": 4.689250946044922,
+      "kl": 0.8184864521026611,
+      "learning_rate": 1e-06,
+      "loss": 0.0107,
+      "num_tokens": 497608.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 128.0,
+      "completions/max_terminated_length": 128.0,
+      "completions/mean_length": 94.75,
+      "completions/mean_terminated_length": 108.28571428571429,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 78.0,
+      "epoch": 0.184,
+      "format_failures": 0.0,
+      "grad_norm": 1.715865135192871,
+      "kl": 0.04798049572855234,
+      "learning_rate": 1e-06,
+      "loss": 0.0176,
+      "num_tokens": 506248.0,
+      "reward": 0.3333333432674408,
+      "reward_std": 0.35634833574295044,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 328.0,
+      "completions/max_terminated_length": 328.0,
+      "completions/mean_length": 235.75,
+      "completions/mean_terminated_length": 269.42857142857144,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 0.188,
+      "format_failures": 0.0,
+      "grad_norm": 0.770552933216095,
+      "kl": 0.026089726015925407,
+      "learning_rate": 1e-06,
+      "loss": -0.1374,
+      "num_tokens": 526432.0,
+      "reward": 0.615674614906311,
+      "reward_std": 0.2741951644420624,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 170.0,
+      "completions/max_terminated_length": 170.0,
+      "completions/mean_length": 131.875,
+      "completions/mean_terminated_length": 150.71428571428572,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 136.0,
+      "epoch": 0.192,
+      "format_failures": 0.0,
+      "grad_norm": 0.5728135108947754,
+      "kl": 0.08094584196805954,
+      "learning_rate": 1e-06,
+      "loss": -0.0257,
+      "num_tokens": 534408.0,
+      "reward": 0.09375,
+      "reward_std": 0.2651650309562683,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 156.0,
+      "completions/max_terminated_length": 156.0,
+      "completions/mean_length": 116.25,
+      "completions/mean_terminated_length": 132.85714285714286,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 96.0,
+      "epoch": 0.196,
+      "format_failures": 1.0,
+      "grad_norm": 1.1056642532348633,
+      "kl": 0.5166730880737305,
+      "learning_rate": 1e-06,
+      "loss": 0.0039,
+      "num_tokens": 542224.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 275.0,
+      "completions/max_terminated_length": 275.0,
+      "completions/mean_length": 102.375,
+      "completions/mean_terminated_length": 117.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 49.0,
+      "epoch": 0.2,
+      "format_failures": 0.0,
+      "grad_norm": 2.430076837539673,
+      "kl": 0.04860229790210724,
+      "learning_rate": 1e-06,
+      "loss": 1.0757,
+      "num_tokens": 557760.0,
+      "reward": 0.7159091234207153,
+      "reward_std": 0.41142913699150085,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 80.0,
+      "completions/max_terminated_length": 80.0,
+      "completions/mean_length": 48.25,
+      "completions/mean_terminated_length": 64.33333333333333,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 35.0,
+      "epoch": 0.204,
+      "format_failures": 0.0,
+      "grad_norm": 62.65098190307617,
+      "kl": 8.405316352844238,
+      "learning_rate": 1e-06,
+      "loss": 0.0048,
+      "num_tokens": 565040.0,
+      "reward": 0.75,
+      "reward_std": 0.4629100561141968,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2051.0,
+      "completions/max_terminated_length": 2051.0,
+      "completions/mean_length": 493.75,
+      "completions/mean_terminated_length": 564.2857142857143,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 36.0,
+      "epoch": 0.208,
+      "format_failures": 0.0,
+      "grad_norm": 1.2885843515396118,
+      "kl": 0.019100312143564224,
+      "learning_rate": 1e-06,
+      "loss": -0.1233,
+      "num_tokens": 587416.0,
+      "reward": 0.22045454382896423,
+      "reward_std": 0.5661183595657349,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 321.0,
+      "completions/max_terminated_length": 321.0,
+      "completions/mean_length": 195.375,
+      "completions/mean_terminated_length": 223.28571428571428,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 71.0,
+      "epoch": 0.212,
+      "format_failures": 0.0,
+      "grad_norm": 1.2086297273635864,
+      "kl": 0.17173044383525848,
+      "learning_rate": 1e-06,
+      "loss": -0.0158,
+      "num_tokens": 598440.0,
+      "reward": 0.25,
+      "reward_std": 0.4629100561141968,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 367.0,
+      "completions/max_terminated_length": 367.0,
+      "completions/mean_length": 270.375,
+      "completions/mean_terminated_length": 309.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 0.216,
+      "format_failures": 0.0,
+      "grad_norm": 0.35032373666763306,
+      "kl": 0.040101515129208565,
+      "learning_rate": 1e-06,
+      "loss": 0.0421,
+      "num_tokens": 608128.0,
+      "reward": 0.5280122756958008,
+      "reward_std": 0.23830601572990417,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 69.0,
+      "completions/max_terminated_length": 69.0,
+      "completions/mean_length": 57.5,
+      "completions/mean_terminated_length": 65.71428571428571,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 46.0,
+      "epoch": 0.22,
+      "format_failures": 0.0,
+      "grad_norm": 1.1529607772827148,
+      "kl": 0.18720119446516037,
+      "learning_rate": 1e-06,
+      "loss": -0.0008,
+      "num_tokens": 614320.0,
+      "reward": 0.1875,
+      "reward_std": 0.1157275140285492,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 199.0,
+      "completions/max_terminated_length": 199.0,
+      "completions/mean_length": 160.625,
+      "completions/mean_terminated_length": 183.57142857142858,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 161.0,
+      "epoch": 0.224,
+      "format_failures": 0.0,
+      "grad_norm": 1.1264208555221558,
+      "kl": 0.08833763748407364,
+      "learning_rate": 1e-06,
+      "loss": 0.0454,
+      "num_tokens": 623424.0,
+      "reward": 0.2916666567325592,
+      "reward_std": 0.4520675837993622,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 242.0,
+      "completions/max_terminated_length": 242.0,
+      "completions/mean_length": 184.625,
+      "completions/mean_terminated_length": 211.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 0.228,
+      "format_failures": 0.0,
+      "grad_norm": 0.03325602412223816,
+      "kl": 0.04312510974705219,
+      "learning_rate": 1e-06,
+      "loss": 0.0002,
+      "num_tokens": 631976.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 95.0,
+      "completions/max_terminated_length": 95.0,
+      "completions/mean_length": 69.5,
+      "completions/mean_terminated_length": 79.42857142857143,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 59.0,
+      "epoch": 0.232,
+      "format_failures": 0.0,
+      "grad_norm": 0.04106176272034645,
+      "kl": 0.026903850957751274,
+      "learning_rate": 1e-06,
+      "loss": 0.0002,
+      "num_tokens": 637776.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 508.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 273.5,
+      "completions/mean_terminated_length": 312.57142857142856,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 148.0,
+      "epoch": 0.236,
+      "format_failures": 0.0,
+      "grad_norm": 0.11601117998361588,
+      "kl": 0.0859757624566555,
+      "learning_rate": 1e-06,
+      "loss": 0.0004,
+      "num_tokens": 650616.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 395.0,
+      "completions/max_terminated_length": 395.0,
+      "completions/mean_length": 298.75,
+      "completions/mean_terminated_length": 341.42857142857144,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 0.24,
+      "format_failures": 0.0,
+      "grad_norm": 0.3067856729030609,
+      "kl": 0.1856345497071743,
+      "learning_rate": 1e-06,
+      "loss": 0.033,
+      "num_tokens": 659168.0,
+      "reward": 0.5159090757369995,
+      "reward_std": 0.1970113068819046,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 71.0,
+      "completions/max_terminated_length": 71.0,
+      "completions/mean_length": 58.75,
+      "completions/mean_terminated_length": 67.14285714285714,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 53.0,
+      "epoch": 0.244,
+      "format_failures": 0.0,
+      "grad_norm": 1.8961628675460815,
+      "kl": 0.0375029481947422,
+      "learning_rate": 1e-06,
+      "loss": -0.0261,
+      "num_tokens": 663984.0,
+      "reward": 0.8500000238418579,
+      "reward_std": 0.3505098223686218,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 428.0,
+      "completions/max_terminated_length": 428.0,
+      "completions/mean_length": 251.625,
+      "completions/mean_terminated_length": 287.57142857142856,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 0.248,
+      "format_failures": 0.0,
+      "grad_norm": 0.2780621349811554,
+      "kl": 0.05490433797240257,
+      "learning_rate": 1e-06,
+      "loss": 0.0335,
+      "num_tokens": 672792.0,
+      "reward": 0.5874999761581421,
+      "reward_std": 0.16226325929164886,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 318.0,
+      "completions/max_terminated_length": 318.0,
+      "completions/mean_length": 208.25,
+      "completions/mean_terminated_length": 238.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 162.0,
+      "epoch": 0.252,
+      "format_failures": 0.0,
+      "grad_norm": 1.4749125242233276,
+      "kl": 0.11917952820658684,
+      "learning_rate": 1e-06,
+      "loss": 0.5038,
+      "num_tokens": 692808.0,
+      "reward": 0.5503472089767456,
+      "reward_std": 0.4739660620689392,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 209.0,
+      "completions/max_terminated_length": 209.0,
+      "completions/mean_length": 121.625,
+      "completions/mean_terminated_length": 139.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 80.0,
+      "epoch": 0.256,
+      "format_failures": 0.0,
+      "grad_norm": 0.9009966850280762,
+      "kl": 0.04829751141369343,
+      "learning_rate": 1e-06,
+      "loss": -0.0333,
+      "num_tokens": 713192.0,
+      "reward": 0.4819444417953491,
+      "reward_std": 0.13385315239429474,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 299.0,
+      "completions/max_terminated_length": 299.0,
+      "completions/mean_length": 202.875,
+      "completions/mean_terminated_length": 231.85714285714286,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 126.0,
+      "epoch": 0.26,
+      "format_failures": 0.0,
+      "grad_norm": 0.6346305012702942,
+      "kl": 0.08024599775671959,
+      "learning_rate": 1e-06,
+      "loss": 0.067,
+      "num_tokens": 720472.0,
+      "reward": 0.25189393758773804,
+      "reward_std": 0.2742690443992615,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 527.0,
+      "completions/max_terminated_length": 527.0,
+      "completions/mean_length": 361.875,
+      "completions/mean_terminated_length": 413.57142857142856,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 0.264,
+      "format_failures": 0.0,
+      "grad_norm": 0.3846381604671478,
+      "kl": 0.03228219784796238,
+      "learning_rate": 1e-06,
+      "loss": 0.0465,
+      "num_tokens": 732112.0,
+      "reward": 0.3159722089767456,
+      "reward_std": 0.2696826457977295,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 652.0,
+      "completions/max_terminated_length": 652.0,
+      "completions/mean_length": 350.25,
+      "completions/mean_terminated_length": 400.2857142857143,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 272.0,
+      "epoch": 0.268,
+      "format_failures": 0.0,
+      "grad_norm": 0.2731687128543854,
+      "kl": 0.02399719413369894,
+      "learning_rate": 1e-06,
+      "loss": 0.0932,
+      "num_tokens": 752768.0,
+      "reward": 0.48750001192092896,
+      "reward_std": 0.17878557741641998,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 115.0,
+      "completions/max_terminated_length": 115.0,
+      "completions/mean_length": 57.5,
+      "completions/mean_terminated_length": 65.71428571428571,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 43.0,
+      "epoch": 0.272,
+      "format_failures": 0.0,
+      "grad_norm": 0.5548056960105896,
+      "kl": 0.3331392854452133,
+      "learning_rate": 1e-06,
+      "loss": 0.0037,
+      "num_tokens": 758312.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 236.0,
+      "completions/max_terminated_length": 236.0,
+      "completions/mean_length": 96.0,
+      "completions/mean_terminated_length": 109.71428571428571,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 44.0,
+      "epoch": 0.276,
+      "format_failures": 0.0,
+      "grad_norm": 5.602732181549072,
+      "kl": 1.5559703707695007,
+      "learning_rate": 1e-06,
+      "loss": -0.0333,
+      "num_tokens": 765560.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 505.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 288.25,
+      "completions/mean_terminated_length": 329.42857142857144,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 259.0,
+      "epoch": 0.28,
+      "format_failures": 0.0,
+      "grad_norm": 0.28595268726348877,
+      "kl": 0.05494564212858677,
+      "learning_rate": 1e-06,
+      "loss": 0.0934,
+      "num_tokens": 782656.0,
+      "reward": 0.2406907081604004,
+      "reward_std": 0.2288402020931244,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 389.0,
+      "completions/max_terminated_length": 389.0,
+      "completions/mean_length": 306.375,
+      "completions/mean_terminated_length": 350.14285714285717,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 280.0,
+      "epoch": 0.284,
+      "format_failures": 0.0,
+      "grad_norm": 0.4375990033149719,
+      "kl": 0.15084227919578552,
+      "learning_rate": 1e-06,
+      "loss": -0.0137,
+      "num_tokens": 792064.0,
+      "reward": 0.6625000238418579,
+      "reward_std": 0.31139087677001953,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 126.0,
+      "completions/max_terminated_length": 126.0,
+      "completions/mean_length": 15.75,
+      "completions/mean_terminated_length": 126.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 126.0,
+      "epoch": 0.288,
+      "format_failures": 0.0,
+      "grad_norm": 4.322110652923584,
+      "kl": 0.025789054110646248,
+      "learning_rate": 1e-06,
+      "loss": 0.0076,
+      "num_tokens": 800160.0,
+      "reward": 0.875,
+      "reward_std": 0.3535533845424652,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 433.0,
+      "completions/max_terminated_length": 433.0,
+      "completions/mean_length": 276.75,
+      "completions/mean_terminated_length": 316.2857142857143,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 0.292,
+      "format_failures": 0.0,
+      "grad_norm": 0.6268705725669861,
+      "kl": 0.08498941920697689,
+      "learning_rate": 1e-06,
+      "loss": -0.0614,
+      "num_tokens": 810104.0,
+      "reward": 0.41130954027175903,
+      "reward_std": 0.3625659644603729,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 277.0,
+      "completions/max_terminated_length": 277.0,
+      "completions/mean_length": 221.125,
+      "completions/mean_terminated_length": 252.71428571428572,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 117.0,
+      "epoch": 0.296,
+      "format_failures": 0.0,
+      "grad_norm": 0.8162993788719177,
+      "kl": 0.0225818594917655,
+      "learning_rate": 1e-06,
+      "loss": 0.0523,
+      "num_tokens": 822920.0,
+      "reward": 0.7083333730697632,
+      "reward_std": 0.4520675837993622,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 493.0,
+      "completions/max_terminated_length": 493.0,
+      "completions/mean_length": 249.625,
+      "completions/mean_terminated_length": 285.2857142857143,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 127.0,
+      "epoch": 0.3,
+      "format_failures": 0.0,
+      "grad_norm": 0.3638235032558441,
+      "kl": 0.10483588464558125,
+      "learning_rate": 1e-06,
+      "loss": 0.093,
+      "num_tokens": 833608.0,
+      "reward": 0.2819444537162781,
+      "reward_std": 0.2347228229045868,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 315.0,
+      "completions/max_terminated_length": 315.0,
+      "completions/mean_length": 92.75,
+      "completions/mean_terminated_length": 106.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 58.0,
+      "epoch": 0.304,
+      "format_failures": 0.0,
+      "grad_norm": 0.005022191442549229,
+      "kl": 0.01964521873742342,
+      "learning_rate": 1e-06,
+      "loss": 0.0,
+      "num_tokens": 855560.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1057.0,
+      "completions/max_terminated_length": 1057.0,
+      "completions/mean_length": 300.25,
+      "completions/mean_terminated_length": 343.14285714285717,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.308,
+      "format_failures": 0.0,
+      "grad_norm": 0.6531589031219482,
+      "kl": 0.1464357189834118,
+      "learning_rate": 1e-06,
+      "loss": -0.0231,
+      "num_tokens": 873712.0,
+      "reward": 0.24836310744285583,
+      "reward_std": 0.23662379384040833,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 468.0,
+      "completions/max_terminated_length": 468.0,
+      "completions/mean_length": 271.125,
+      "completions/mean_terminated_length": 309.85714285714283,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 65.0,
+      "epoch": 0.312,
+      "format_failures": 0.0,
+      "grad_norm": 0.585955023765564,
+      "kl": 0.0404690857976675,
+      "learning_rate": 1e-06,
+      "loss": 0.0946,
+      "num_tokens": 882408.0,
+      "reward": 0.6073564291000366,
+      "reward_std": 0.39037758111953735,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 174.0,
+      "completions/max_terminated_length": 174.0,
+      "completions/mean_length": 75.25,
+      "completions/mean_terminated_length": 150.5,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 140.0,
+      "epoch": 0.316,
+      "format_failures": 0.0,
+      "grad_norm": 4.991185188293457,
+      "kl": 0.13191331177949905,
+      "learning_rate": 1e-06,
+      "loss": -0.1159,
+      "num_tokens": 891768.0,
+      "reward": 0.5,
+      "reward_std": 0.5345224738121033,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 290.0,
+      "completions/max_terminated_length": 290.0,
+      "completions/mean_length": 244.0,
+      "completions/mean_terminated_length": 278.85714285714283,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 263.0,
+      "epoch": 0.32,
+      "format_failures": 0.0,
+      "grad_norm": 1.556532621383667,
+      "kl": 0.30473417043685913,
+      "learning_rate": 1e-06,
+      "loss": -0.0131,
+      "num_tokens": 900480.0,
+      "reward": 0.375,
+      "reward_std": 0.5175491571426392,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 56.0,
+      "completions/max_terminated_length": 56.0,
+      "completions/mean_length": 37.25,
+      "completions/mean_terminated_length": 42.57142857142857,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 31.0,
+      "epoch": 0.324,
+      "format_failures": 0.0,
+      "grad_norm": 6.271825790405273,
+      "kl": 1.4292120337486267,
+      "learning_rate": 1e-06,
+      "loss": -0.0008,
+      "num_tokens": 905384.0,
+      "reward": 0.09375,
+      "reward_std": 0.2651650309562683,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 155.0,
+      "completions/max_terminated_length": 155.0,
+      "completions/mean_length": 106.375,
+      "completions/mean_terminated_length": 121.57142857142857,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 62.0,
+      "epoch": 0.328,
+      "format_failures": 0.0,
+      "grad_norm": 1.0847584009170532,
+      "kl": 0.4334397315979004,
+      "learning_rate": 1e-06,
+      "loss": 0.0061,
+      "num_tokens": 912488.0,
+      "reward": 0.1875,
+      "reward_std": 0.2642374634742737,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 518.0,
+      "completions/max_terminated_length": 518.0,
+      "completions/mean_length": 445.625,
+      "completions/mean_terminated_length": 509.2857142857143,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 482.0,
+      "epoch": 0.332,
+      "format_failures": 0.0,
+      "grad_norm": 0.3242776393890381,
+      "kl": 0.028012586757540703,
+      "learning_rate": 1e-06,
+      "loss": -0.0109,
+      "num_tokens": 927144.0,
+      "reward": 0.6354166865348816,
+      "reward_std": 0.41770702600479126,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 162.0,
+      "completions/max_terminated_length": 162.0,
+      "completions/mean_length": 80.75,
+      "completions/mean_terminated_length": 92.28571428571429,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 60.0,
+      "epoch": 0.336,
+      "format_failures": 0.0,
+      "grad_norm": 2.415727376937866,
+      "kl": 1.3026588559150696,
+      "learning_rate": 1e-06,
+      "loss": -0.0519,
+      "num_tokens": 933024.0,
+      "reward": 0.2708333432674408,
+      "reward_std": 0.39778655767440796,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 509.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 287.25,
+      "completions/mean_terminated_length": 328.2857142857143,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 0.34,
+      "format_failures": 0.0,
+      "grad_norm": 0.5015918612480164,
+      "kl": 0.07602308504283428,
+      "learning_rate": 1e-06,
+      "loss": 0.0013,
+      "num_tokens": 943576.0,
+      "reward": 0.35555556416511536,
+      "reward_std": 0.330837219953537,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 147.0,
+      "completions/max_terminated_length": 147.0,
+      "completions/mean_length": 92.875,
+      "completions/mean_terminated_length": 106.14285714285714,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 30.0,
+      "epoch": 0.344,
+      "format_failures": 0.0,
+      "grad_norm": 3162.383056640625,
+      "kl": 592.9862050414085,
+      "learning_rate": 1e-06,
+      "loss": 4.4073,
+      "num_tokens": 950272.0,
+      "reward": 0.125,
+      "reward_std": 0.3535533845424652,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 159.0,
+      "completions/max_terminated_length": 159.0,
+      "completions/mean_length": 78.875,
+      "completions/mean_terminated_length": 157.75,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 154.0,
+      "epoch": 0.348,
+      "format_failures": 0.0,
+      "grad_norm": 5.812924385070801,
+      "kl": 0.03395126201212406,
+      "learning_rate": 1e-06,
+      "loss": 0.001,
+      "num_tokens": 956992.0,
+      "reward": 0.625,
+      "reward_std": 0.5175491571426392,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 122.0,
+      "completions/max_terminated_length": 122.0,
+      "completions/mean_length": 85.5,
+      "completions/mean_terminated_length": 97.71428571428571,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 75.0,
+      "epoch": 0.352,
+      "format_failures": 0.0,
+      "grad_norm": 1.021047830581665,
+      "kl": 0.18108929693698883,
+      "learning_rate": 1e-06,
+      "loss": 0.0333,
+      "num_tokens": 962232.0,
+      "reward": 0.5613095164299011,
+      "reward_std": 0.23917356133460999,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 364.0,
+      "completions/max_terminated_length": 364.0,
+      "completions/mean_length": 119.625,
+      "completions/mean_terminated_length": 136.71428571428572,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 61.0,
+      "epoch": 0.356,
+      "format_failures": 0.0,
+      "grad_norm": 1.7294161319732666,
+      "kl": 0.41833993047475815,
+      "learning_rate": 1e-06,
+      "loss": 0.1262,
+      "num_tokens": 975584.0,
+      "reward": 0.109375,
+      "reward_std": 0.14250017702579498,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 215.0,
+      "completions/max_terminated_length": 215.0,
+      "completions/mean_length": 138.0,
+      "completions/mean_terminated_length": 157.71428571428572,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 56.0,
+      "epoch": 0.36,
+      "format_failures": 0.0,
+      "grad_norm": 64.84632873535156,
+      "kl": 29.09031867980957,
+      "learning_rate": 1e-06,
+      "loss": 0.188,
+      "num_tokens": 981256.0,
+      "reward": 0.7403273582458496,
+      "reward_std": 0.17907913029193878,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 195.0,
+      "completions/max_terminated_length": 195.0,
+      "completions/mean_length": 91.125,
+      "completions/mean_terminated_length": 121.5,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 64.0,
+      "epoch": 0.364,
+      "format_failures": 0.0,
+      "grad_norm": 84280.7421875,
+      "kl": 6375.002594873309,
+      "learning_rate": 1e-06,
+      "loss": 96.8935,
+      "num_tokens": 992408.0,
+      "reward": 0.1666666716337204,
+      "reward_std": 0.35634833574295044,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 405.0,
+      "completions/max_terminated_length": 405.0,
+      "completions/mean_length": 301.625,
+      "completions/mean_terminated_length": 344.7142857142857,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 233.0,
+      "epoch": 0.368,
+      "format_failures": 0.0,
+      "grad_norm": 1.2784438133239746,
+      "kl": 0.7278856039047241,
+      "learning_rate": 1e-06,
+      "loss": 0.1895,
+      "num_tokens": 1010848.0,
+      "reward": 0.39940476417541504,
+      "reward_std": 0.3344242572784424,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 435.0,
+      "completions/max_terminated_length": 435.0,
+      "completions/mean_length": 275.375,
+      "completions/mean_terminated_length": 314.7142857142857,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 0.372,
+      "format_failures": 0.0,
+      "grad_norm": 0.2864258289337158,
+      "kl": 0.08188853040337563,
+      "learning_rate": 1e-06,
+      "loss": 0.0434,
+      "num_tokens": 1020488.0,
+      "reward": 0.5244791507720947,
+      "reward_std": 0.16294103860855103,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 105.0,
+      "completions/max_terminated_length": 105.0,
+      "completions/mean_length": 58.25,
+      "completions/mean_terminated_length": 66.57142857142857,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 43.0,
+      "epoch": 0.376,
+      "format_failures": 0.0,
+      "grad_norm": 22.611703872680664,
+      "kl": 1.769313856959343,
+      "learning_rate": 1e-06,
+      "loss": 0.0731,
+      "num_tokens": 1028496.0,
+      "reward": 0.5,
+      "reward_std": 0.37796446681022644,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 252.0,
+      "completions/max_terminated_length": 252.0,
+      "completions/mean_length": 136.625,
+      "completions/mean_terminated_length": 156.14285714285714,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 94.0,
+      "epoch": 0.38,
+      "format_failures": 0.0,
+      "grad_norm": 3.4661715030670166,
+      "kl": 0.3033728860318661,
+      "learning_rate": 1e-06,
+      "loss": 0.0753,
+      "num_tokens": 1038480.0,
+      "reward": 0.4479166567325592,
+      "reward_std": 0.41052013635635376,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 142.0,
+      "completions/max_terminated_length": 142.0,
+      "completions/mean_length": 109.75,
+      "completions/mean_terminated_length": 125.42857142857143,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 66.0,
+      "epoch": 0.384,
+      "format_failures": 0.0,
+      "grad_norm": 2.9459471702575684,
+      "kl": 0.8582945615053177,
+      "learning_rate": 1e-06,
+      "loss": -0.0371,
+      "num_tokens": 1048240.0,
+      "reward": 0.6180555820465088,
+      "reward_std": 0.42537203431129456,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 76.0,
+      "completions/max_terminated_length": 76.0,
+      "completions/mean_length": 64.0,
+      "completions/mean_terminated_length": 73.14285714285714,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 72.0,
+      "epoch": 0.388,
+      "format_failures": 0.0,
+      "grad_norm": 0.43100497126579285,
+      "kl": 0.06553871184587479,
+      "learning_rate": 1e-06,
+      "loss": 0.0368,
+      "num_tokens": 1054728.0,
+      "reward": 0.9642857313156128,
+      "reward_std": 0.10101523995399475,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 66.0,
+      "completions/max_terminated_length": 66.0,
+      "completions/mean_length": 40.25,
+      "completions/mean_terminated_length": 46.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 39.0,
+      "epoch": 0.392,
+      "format_failures": 0.0,
+      "grad_norm": 6.787167072296143,
+      "kl": 1.8237296342849731,
+      "learning_rate": 1e-06,
+      "loss": 0.0307,
+      "num_tokens": 1060304.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 344.0,
+      "completions/max_terminated_length": 344.0,
+      "completions/mean_length": 179.125,
+      "completions/mean_terminated_length": 204.71428571428572,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 150.0,
+      "epoch": 0.396,
+      "format_failures": 0.0,
+      "grad_norm": 1.4021135568618774,
+      "kl": 0.06424028240144253,
+      "learning_rate": 1e-06,
+      "loss": -0.0136,
+      "num_tokens": 1082672.0,
+      "reward": 0.490579217672348,
+      "reward_std": 0.34001559019088745,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 189.0,
+      "completions/max_terminated_length": 189.0,
+      "completions/mean_length": 52.5,
+      "completions/mean_terminated_length": 140.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 101.0,
+      "epoch": 0.4,
+      "format_failures": 0.0,
+      "grad_norm": 4.928287982940674,
+      "kl": 0.6296049430966377,
+      "learning_rate": 1e-06,
+      "loss": -0.2632,
+      "num_tokens": 1090648.0,
+      "reward": 0.75,
+      "reward_std": 0.4629100561141968,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 104.0,
+      "completions/max_terminated_length": 104.0,
+      "completions/mean_length": 73.125,
+      "completions/mean_terminated_length": 83.57142857142857,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 70.0,
+      "epoch": 0.404,
+      "format_failures": 1.0,
+      "grad_norm": 1.0927364826202393,
+      "kl": 0.4457448348402977,
+      "learning_rate": 1e-06,
+      "loss": 0.0054,
+      "num_tokens": 1099240.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 141.0,
+      "completions/max_terminated_length": 141.0,
+      "completions/mean_length": 100.625,
+      "completions/mean_terminated_length": 115.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 90.0,
+      "epoch": 0.408,
+      "format_failures": 0.0,
+      "grad_norm": 0.3484640419483185,
+      "kl": 0.014615435153245926,
+      "learning_rate": 1e-06,
+      "loss": -0.0011,
+      "num_tokens": 1106512.0,
+      "reward": 0.8717262148857117,
+      "reward_std": 0.1315489560365677,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 120.0,
+      "completions/max_terminated_length": 120.0,
+      "completions/mean_length": 79.0,
+      "completions/mean_terminated_length": 90.28571428571429,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 69.0,
+      "epoch": 0.412,
+      "format_failures": 0.0,
+      "grad_norm": 2.578859329223633,
+      "kl": 0.05575744202360511,
+      "learning_rate": 1e-06,
+      "loss": -0.1115,
+      "num_tokens": 1113224.0,
+      "reward": 0.53125,
+      "reward_std": 0.41052013635635376,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 299.0,
+      "completions/max_terminated_length": 299.0,
+      "completions/mean_length": 224.125,
+      "completions/mean_terminated_length": 256.14285714285717,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 0.416,
+      "format_failures": 0.0,
+      "grad_norm": 0.1279614269733429,
+      "kl": 0.008564054034650326,
+      "learning_rate": 1e-06,
+      "loss": -0.0026,
+      "num_tokens": 1120832.0,
+      "reward": 0.6416666507720947,
+      "reward_std": 0.08864051848649979,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 143.0,
+      "completions/max_terminated_length": 143.0,
+      "completions/mean_length": 120.75,
+      "completions/mean_terminated_length": 138.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 125.0,
+      "epoch": 0.42,
+      "format_failures": 0.0,
+      "grad_norm": 2.8721704483032227,
+      "kl": 0.028846602886915207,
+      "learning_rate": 1e-06,
+      "loss": 0.1163,
+      "num_tokens": 1129032.0,
+      "reward": 0.5833333730697632,
+      "reward_std": 0.49601587653160095,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 92.0,
+      "completions/max_terminated_length": 92.0,
+      "completions/mean_length": 61.0,
+      "completions/mean_terminated_length": 69.71428571428571,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 56.0,
+      "epoch": 0.424,
+      "format_failures": 0.0,
+      "grad_norm": 0.4012051820755005,
+      "kl": 0.13534526526927948,
+      "learning_rate": 1e-06,
+      "loss": 0.0023,
+      "num_tokens": 1137584.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 266.0,
+      "completions/max_terminated_length": 266.0,
+      "completions/mean_length": 169.0,
+      "completions/mean_terminated_length": 193.14285714285714,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 87.0,
+      "epoch": 0.428,
+      "format_failures": 0.0,
+      "grad_norm": 0.34922441840171814,
+      "kl": 0.014531925320625305,
+      "learning_rate": 1e-06,
+      "loss": 0.0412,
+      "num_tokens": 1144344.0,
+      "reward": 0.3263888955116272,
+      "reward_std": 0.19911068677902222,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 54.0,
+      "completions/max_terminated_length": 54.0,
+      "completions/mean_length": 44.75,
+      "completions/mean_terminated_length": 51.142857142857146,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 50.0,
+      "epoch": 0.432,
+      "format_failures": 0.0,
+      "grad_norm": 0.8536809682846069,
+      "kl": 0.01497908541932702,
+      "learning_rate": 1e-06,
+      "loss": 0.0128,
+      "num_tokens": 1148760.0,
+      "reward": 0.90625,
+      "reward_std": 0.2651650309562683,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 110.0,
+      "completions/max_terminated_length": 110.0,
+      "completions/mean_length": 85.375,
+      "completions/mean_terminated_length": 97.57142857142857,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 79.0,
+      "epoch": 0.436,
+      "format_failures": 0.0,
+      "grad_norm": 3.196063995361328,
+      "kl": 0.09259714558720589,
+      "learning_rate": 1e-06,
+      "loss": -0.0254,
+      "num_tokens": 1154816.0,
+      "reward": 0.375,
+      "reward_std": 0.5175491571426392,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 122.0,
+      "completions/max_terminated_length": 122.0,
+      "completions/mean_length": 93.125,
+      "completions/mean_terminated_length": 106.42857142857143,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.44,
+      "format_failures": 0.0,
+      "grad_norm": 2.7271082401275635,
+      "kl": 0.04449745221063495,
+      "learning_rate": 1e-06,
+      "loss": -0.0126,
+      "num_tokens": 1163208.0,
+      "reward": 0.4166666865348816,
+      "reward_std": 0.34503278136253357,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 147.0,
+      "completions/max_terminated_length": 147.0,
+      "completions/mean_length": 114.5,
+      "completions/mean_terminated_length": 130.85714285714286,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 62.0,
+      "epoch": 0.444,
+      "format_failures": 0.0,
+      "grad_norm": 0.10579583793878555,
+      "kl": 0.055604600347578526,
+      "learning_rate": 1e-06,
+      "loss": 0.0006,
+      "num_tokens": 1174056.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 173.0,
+      "completions/max_terminated_length": 173.0,
+      "completions/mean_length": 146.375,
+      "completions/mean_terminated_length": 167.28571428571428,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 164.0,
+      "epoch": 0.448,
+      "format_failures": 0.0,
+      "grad_norm": 0.41507479548454285,
+      "kl": 0.019602006301283836,
+      "learning_rate": 1e-06,
+      "loss": 0.0037,
+      "num_tokens": 1181760.0,
+      "reward": 0.9750000238418579,
+      "reward_std": 0.0707106813788414,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 59.0,
+      "completions/max_terminated_length": 59.0,
+      "completions/mean_length": 46.75,
+      "completions/mean_terminated_length": 53.42857142857143,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 41.0,
+      "epoch": 0.452,
+      "format_failures": 0.0,
+      "grad_norm": 2.7538251876831055,
+      "kl": 0.05537968873977661,
+      "learning_rate": 1e-06,
+      "loss": -0.0324,
+      "num_tokens": 1187360.0,
+      "reward": 0.5416666865348816,
+      "reward_std": 0.5019802451133728,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 654.0,
+      "completions/max_terminated_length": 654.0,
+      "completions/mean_length": 341.875,
+      "completions/mean_terminated_length": 390.7142857142857,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 0.456,
+      "format_failures": 0.0,
+      "grad_norm": 0.6517180800437927,
+      "kl": 0.01990941632539034,
+      "learning_rate": 1e-06,
+      "loss": -0.0628,
+      "num_tokens": 1200928.0,
+      "reward": 0.3812499940395355,
+      "reward_std": 0.4225243031978607,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 284.0,
+      "completions/max_terminated_length": 284.0,
+      "completions/mean_length": 212.625,
+      "completions/mean_terminated_length": 283.5,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 0.46,
+      "format_failures": 0.0,
+      "grad_norm": 2.6183741092681885,
+      "kl": 0.3156433766707778,
+      "learning_rate": 1e-06,
+      "loss": -0.0008,
+      "num_tokens": 1209616.0,
+      "reward": 0.875,
+      "reward_std": 0.3535533845424652,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 187.0,
+      "completions/max_terminated_length": 187.0,
+      "completions/mean_length": 127.625,
+      "completions/mean_terminated_length": 145.85714285714286,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.464,
+      "format_failures": 0.0,
+      "grad_norm": 1.40922212600708,
+      "kl": 0.3603953216224909,
+      "learning_rate": 1e-06,
+      "loss": -0.1,
+      "num_tokens": 1216840.0,
+      "reward": 0.375,
+      "reward_std": 0.4520675837993622,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 240.0,
+      "completions/max_terminated_length": 240.0,
+      "completions/mean_length": 172.875,
+      "completions/mean_terminated_length": 197.57142857142858,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 143.0,
+      "epoch": 0.468,
+      "format_failures": 0.0,
+      "grad_norm": 0.5828225612640381,
+      "kl": 0.013718126341700554,
+      "learning_rate": 1e-06,
+      "loss": -0.0188,
+      "num_tokens": 1223464.0,
+      "reward": 0.3083333373069763,
+      "reward_std": 0.2980092763900757,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 296.0,
+      "completions/max_terminated_length": 296.0,
+      "completions/mean_length": 113.0,
+      "completions/mean_terminated_length": 129.14285714285714,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 59.0,
+      "epoch": 0.472,
+      "format_failures": 0.0,
+      "grad_norm": 1.907884120941162,
+      "kl": 0.16990539245307446,
+      "learning_rate": 1e-06,
+      "loss": 0.1637,
+      "num_tokens": 1231632.0,
+      "reward": 0.265625,
+      "reward_std": 0.45531338453292847,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 289.0,
+      "completions/max_terminated_length": 289.0,
+      "completions/mean_length": 139.125,
+      "completions/mean_terminated_length": 159.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 89.0,
+      "epoch": 0.476,
+      "format_failures": 0.0,
+      "grad_norm": 0.5671705007553101,
+      "kl": 0.0328083336353302,
+      "learning_rate": 1e-06,
+      "loss": 0.1641,
+      "num_tokens": 1242688.0,
+      "reward": 0.6208333373069763,
+      "reward_std": 0.3646862208843231,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 123.0,
+      "completions/max_terminated_length": 123.0,
+      "completions/mean_length": 74.375,
+      "completions/mean_terminated_length": 85.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.48,
+      "format_failures": 0.0,
+      "grad_norm": 6.129162788391113,
+      "kl": 2.631644606590271,
+      "learning_rate": 1e-06,
+      "loss": -0.0072,
+      "num_tokens": 1250712.0,
+      "reward": 0.0833333358168602,
+      "reward_std": 0.2357022762298584,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 77.0,
+      "completions/max_terminated_length": 77.0,
+      "completions/mean_length": 33.25,
+      "completions/mean_terminated_length": 66.5,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 35.0,
+      "epoch": 0.484,
+      "format_failures": 0.0,
+      "grad_norm": 2.2025108337402344,
+      "kl": 0.009498461615294218,
+      "learning_rate": 1e-06,
+      "loss": -0.1436,
+      "num_tokens": 1255560.0,
+      "reward": 0.875,
+      "reward_std": 0.3535533845424652,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 582.0,
+      "completions/max_terminated_length": 582.0,
+      "completions/mean_length": 367.875,
+      "completions/mean_terminated_length": 490.5,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 109.0,
+      "epoch": 0.488,
+      "format_failures": 0.0,
+      "grad_norm": 0.6904863715171814,
+      "kl": 0.20124347042292356,
+      "learning_rate": 1e-06,
+      "loss": 0.0011,
+      "num_tokens": 1273976.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 346.0,
+      "completions/max_terminated_length": 346.0,
+      "completions/mean_length": 230.25,
+      "completions/mean_terminated_length": 263.14285714285717,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 135.0,
+      "epoch": 0.492,
+      "format_failures": 0.0,
+      "grad_norm": 0.813983142375946,
+      "kl": 0.09101713076233864,
+      "learning_rate": 1e-06,
+      "loss": 0.0777,
+      "num_tokens": 1282880.0,
+      "reward": 0.6432539820671082,
+      "reward_std": 0.3272421360015869,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 52.0,
+      "completions/max_terminated_length": 52.0,
+      "completions/mean_length": 41.75,
+      "completions/mean_terminated_length": 47.714285714285715,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 34.0,
+      "epoch": 0.496,
+      "format_failures": 0.0,
+      "grad_norm": 4.916449546813965,
+      "kl": 0.6664696265943348,
+      "learning_rate": 1e-06,
+      "loss": -0.0205,
+      "num_tokens": 1288032.0,
+      "reward": 0.8125,
+      "reward_std": 0.3720118999481201,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 111.0,
+      "completions/max_terminated_length": 111.0,
+      "completions/mean_length": 37.0,
+      "completions/mean_terminated_length": 98.66666666666667,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 87.0,
+      "epoch": 0.5,
+      "format_failures": 0.0,
+      "grad_norm": 11.985437393188477,
+      "kl": 2.047822058200836,
+      "learning_rate": 1e-06,
+      "loss": -0.0949,
+      "num_tokens": 1294032.0,
+      "reward": 0.625,
+      "reward_std": 0.5175491571426392,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 226.0,
+      "completions/max_terminated_length": 226.0,
+      "completions/mean_length": 139.5,
+      "completions/mean_terminated_length": 223.2,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.504,
+      "format_failures": 0.0,
+      "grad_norm": 0.06410921365022659,
+      "kl": 0.024711698293685913,
+      "learning_rate": 1e-06,
+      "loss": 0.0002,
+      "num_tokens": 1302656.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 134.0,
+      "completions/max_terminated_length": 134.0,
+      "completions/mean_length": 50.25,
+      "completions/mean_terminated_length": 134.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 134.0,
+      "epoch": 0.508,
+      "format_failures": 0.0,
+      "grad_norm": 5.750870704650879,
+      "kl": 0.32033737003803253,
+      "learning_rate": 1e-06,
+      "loss": 0.0008,
+      "num_tokens": 1311200.0,
+      "reward": 0.78125,
+      "reward_std": 0.33905068039894104,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 187.0,
+      "completions/max_terminated_length": 187.0,
+      "completions/mean_length": 140.0,
+      "completions/mean_terminated_length": 160.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 130.0,
+      "epoch": 0.512,
+      "format_failures": 0.0,
+      "grad_norm": 2.3594982624053955,
+      "kl": 0.27750419452786446,
+      "learning_rate": 1e-06,
+      "loss": 0.1238,
+      "num_tokens": 1319216.0,
+      "reward": 0.6676406860351562,
+      "reward_std": 0.22850170731544495,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 154.0,
+      "completions/max_terminated_length": 154.0,
+      "completions/mean_length": 109.625,
+      "completions/mean_terminated_length": 125.28571428571429,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 90.0,
+      "epoch": 0.516,
+      "format_failures": 0.0,
+      "grad_norm": 0.7231677174568176,
+      "kl": 0.06682828813791275,
+      "learning_rate": 1e-06,
+      "loss": 0.0391,
+      "num_tokens": 1325216.0,
+      "reward": 0.6453869342803955,
+      "reward_std": 0.17804734408855438,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 131.0,
+      "completions/max_terminated_length": 131.0,
+      "completions/mean_length": 87.25,
+      "completions/mean_terminated_length": 99.71428571428571,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 79.0,
+      "epoch": 0.52,
+      "format_failures": 0.0,
+      "grad_norm": 5.297896385192871,
+      "kl": 0.6651033144444227,
+      "learning_rate": 1e-06,
+      "loss": 0.0091,
+      "num_tokens": 1330968.0,
+      "reward": 0.5601190328598022,
+      "reward_std": 0.13645371794700623,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 311.0,
+      "completions/max_terminated_length": 311.0,
+      "completions/mean_length": 77.25,
+      "completions/mean_terminated_length": 309.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 307.0,
+      "epoch": 0.524,
+      "format_failures": 0.0,
+      "grad_norm": 0.8595375418663025,
+      "kl": 0.06659615971148014,
+      "learning_rate": 1e-06,
+      "loss": 0.0004,
+      "num_tokens": 1339728.0,
+      "reward": 0.9583333730697632,
+      "reward_std": 0.11785111576318741,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 175.0,
+      "completions/max_terminated_length": 175.0,
+      "completions/mean_length": 41.875,
+      "completions/mean_terminated_length": 167.5,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 0.528,
+      "format_failures": 0.0,
+      "grad_norm": 347.5839538574219,
+      "kl": 61.65154816582799,
+      "learning_rate": 1e-06,
+      "loss": 1.1776,
+      "num_tokens": 1346528.0,
+      "reward": 0.875,
+      "reward_std": 0.3535533845424652,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 425.0,
+      "completions/max_terminated_length": 425.0,
+      "completions/mean_length": 296.125,
+      "completions/mean_terminated_length": 338.42857142857144,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 0.532,
+      "format_failures": 0.0,
+      "grad_norm": 0.43614092469215393,
+      "kl": 0.10557529516518116,
+      "learning_rate": 1e-06,
+      "loss": -0.0366,
+      "num_tokens": 1355920.0,
+      "reward": 0.5107142925262451,
+      "reward_std": 0.21715763211250305,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 257.0,
+      "completions/max_terminated_length": 257.0,
+      "completions/mean_length": 149.625,
+      "completions/mean_terminated_length": 171.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 116.0,
+      "epoch": 0.536,
+      "format_failures": 0.0,
+      "grad_norm": 2.284273624420166,
+      "kl": 0.06671860627830029,
+      "learning_rate": 1e-06,
+      "loss": 0.0794,
+      "num_tokens": 1363104.0,
+      "reward": 0.663690447807312,
+      "reward_std": 0.2778385877609253,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 358.0,
+      "completions/max_terminated_length": 358.0,
+      "completions/mean_length": 215.625,
+      "completions/mean_terminated_length": 246.42857142857142,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 162.0,
+      "epoch": 0.54,
+      "format_failures": 0.0,
+      "grad_norm": 0.8071838021278381,
+      "kl": 0.046601174399256706,
+      "learning_rate": 1e-06,
+      "loss": -0.0342,
+      "num_tokens": 1370928.0,
+      "reward": 0.6100694537162781,
+      "reward_std": 0.3873949348926544,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 228.0,
+      "completions/max_terminated_length": 228.0,
+      "completions/mean_length": 85.5,
+      "completions/mean_terminated_length": 228.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 0.544,
+      "format_failures": 0.0,
+      "grad_norm": 0.14378634095191956,
+      "kl": 0.04612975288182497,
+      "learning_rate": 1e-06,
+      "loss": 0.0006,
+      "num_tokens": 1380216.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 228.0,
+      "completions/max_terminated_length": 228.0,
+      "completions/mean_length": 137.0,
+      "completions/mean_terminated_length": 156.57142857142858,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 112.0,
+      "epoch": 0.548,
+      "format_failures": 0.0,
+      "grad_norm": 0.8862031698226929,
+      "kl": 0.07590018585324287,
+      "learning_rate": 1e-06,
+      "loss": -0.0377,
+      "num_tokens": 1386992.0,
+      "reward": 0.5406250357627869,
+      "reward_std": 0.23373425006866455,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 148.0,
+      "completions/max_terminated_length": 148.0,
+      "completions/mean_length": 73.5,
+      "completions/mean_terminated_length": 84.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 58.0,
+      "epoch": 0.552,
+      "format_failures": 0.0,
+      "grad_norm": 813.8618774414062,
+      "kl": 83.35732051730156,
+      "learning_rate": 1e-06,
+      "loss": 1.2934,
+      "num_tokens": 1396808.0,
+      "reward": 0.4833333492279053,
+      "reward_std": 0.4804098606109619,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 181.0,
+      "completions/max_terminated_length": 181.0,
+      "completions/mean_length": 68.875,
+      "completions/mean_terminated_length": 137.75,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 8.0,
+      "epoch": 0.556,
+      "format_failures": 0.0,
+      "grad_norm": 2.3204505443573,
+      "kl": 0.11221980676054955,
+      "learning_rate": 1e-06,
+      "loss": -0.2757,
+      "num_tokens": 1405448.0,
+      "reward": 0.875,
+      "reward_std": 0.3535533845424652,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 55.0,
+      "completions/max_terminated_length": 55.0,
+      "completions/mean_length": 20.625,
+      "completions/mean_terminated_length": 55.0,
+      "completions/min_length": 0.0,
+      "completions/min_terminated_length": 55.0,
+      "epoch": 0.56,
+      "format_failures": 0.0,
+      "grad_norm": 0.12126144766807556,
+      "kl": 0.013866727240383625,
+      "learning_rate": 1e-06,
+      "loss": 0.0007,
+      "num_tokens": 1413312.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "step": 140
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1000,
+  "num_input_tokens_seen": 1413312,
+  "num_train_epochs": 4,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}