Model save

Browse files

Files changed (4) hide show

README.md +1 -1
all_results.json +4 -4
train_results.json +4 -4
trainer_state.json +281 -281

README.md CHANGED Viewed

@@ -27,7 +27,7 @@ print(output["generated_text"])
 ## Training procedure
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/atutej/huggingface/runs/4e5ri02h)
 This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).

 ## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/atutej/huggingface/runs/ek4mz2ft)
 This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).

all_results.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
     "total_flos": 0.0,
-    "train_loss": 0.04260323340275202,
-    "train_runtime": 3381.4576,
-    "train_samples_per_second": 0.296,
-    "train_steps_per_second": 0.049
 }

 {
     "total_flos": 0.0,
+    "train_loss": -0.017725162387612355,
+    "train_runtime": 3869.3247,
+    "train_samples_per_second": 0.258,
+    "train_steps_per_second": 0.043
 }

train_results.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
     "total_flos": 0.0,
-    "train_loss": 0.04260323340275202,
-    "train_runtime": 3381.4576,
-    "train_samples_per_second": 0.296,
-    "train_steps_per_second": 0.049
 }

 {
     "total_flos": 0.0,
+    "train_loss": -0.017725162387612355,
+    "train_runtime": 3869.3247,
+    "train_samples_per_second": 0.258,
+    "train_steps_per_second": 0.043
 }

trainer_state.json CHANGED Viewed

@@ -16,25 +16,25 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.002083333333333337,
-      "completions/max_length": 728.8,
-      "completions/max_terminated_length": 700.9,
-      "completions/mean_length": 237.7083427429199,
-      "completions/mean_terminated_length": 236.154793548584,
-      "completions/min_length": 55.3,
-      "completions/min_terminated_length": 55.3,
       "epoch": 0.05997001499250375,
-      "frac_reward_zero_std": 0.01666666716337204,
-      "grad_norm": 0.453125,
-      "kl": 0.05377130508422852,
       "learning_rate": 1.9855293386108995e-05,
-      "loss": 0.0039,
-      "num_tokens": 232036.0,
-      "reward": 0.9150718182325364,
-      "reward_std": 0.31903862953186035,
-      "rewards/_accuracy_reward/mean": 0.29215513318777087,
-      "rewards/_accuracy_reward/std": 0.20658667236566544,
-      "rewards/_format_reward/mean": 0.6229166716337204,
-      "rewards/_format_reward/std": 0.28700760900974276,
       "step": 10
     },
     {
@@ -44,25 +44,25 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
-      "completions/max_length": 706.2,
-      "completions/max_terminated_length": 706.2,
-      "completions/mean_length": 312.7083450317383,
-      "completions/mean_terminated_length": 312.7083450317383,
-      "completions/min_length": 92.1,
-      "completions/min_terminated_length": 92.1,
       "epoch": 0.1199400299850075,
       "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.326171875,
-      "kl": 0.069390869140625,
       "learning_rate": 1.936044737814273e-05,
-      "loss": 0.0615,
-      "num_tokens": 500288.0,
-      "reward": 1.3227388501167296,
-      "reward_std": 0.23705578446388245,
-      "rewards/_accuracy_reward/mean": 0.35398882925510405,
-      "rewards/_accuracy_reward/std": 0.18871113955974578,
-      "rewards/_format_reward/mean": 0.96875,
-      "rewards/_format_reward/std": 0.1373054191470146,
       "step": 20
     },
     {
@@ -72,25 +72,25 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
-      "completions/max_length": 639.0,
-      "completions/max_terminated_length": 639.0,
-      "completions/mean_length": 240.73542327880858,
-      "completions/mean_terminated_length": 240.73542327880858,
-      "completions/min_length": 75.9,
-      "completions/min_terminated_length": 75.9,
       "epoch": 0.17991004497751126,
       "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.57421875,
-      "kl": 0.099114990234375,
       "learning_rate": 1.8531342035272768e-05,
-      "loss": 0.0922,
-      "num_tokens": 734305.0,
-      "reward": 1.4282155513763428,
-      "reward_std": 0.16938417106866838,
-      "rewards/_accuracy_reward/mean": 0.43863220810890197,
-      "rewards/_accuracy_reward/std": 0.1596881665289402,
-      "rewards/_format_reward/mean": 0.9895833373069763,
-      "rewards/_format_reward/std": 0.06349536329507828,
       "step": 30
     },
     {
@@ -100,25 +100,25 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
-      "completions/max_length": 322.9,
-      "completions/max_terminated_length": 322.9,
-      "completions/mean_length": 115.66041946411133,
-      "completions/mean_terminated_length": 115.66041946411133,
-      "completions/min_length": 58.9,
-      "completions/min_terminated_length": 58.9,
       "epoch": 0.239880059970015,
       "frac_reward_zero_std": 0.0,
       "grad_norm": 0.63671875,
-      "kl": 0.198193359375,
       "learning_rate": 1.7397584510798208e-05,
-      "loss": 0.0363,
-      "num_tokens": 907950.0,
-      "reward": 1.4457251667976379,
-      "reward_std": 0.1396712526679039,
-      "rewards/_accuracy_reward/mean": 0.4478084534406662,
-      "rewards/_accuracy_reward/std": 0.1528099738061428,
-      "rewards/_format_reward/mean": 0.9979166686534882,
-      "rewards/_format_reward/std": 0.014433756470680237,
       "step": 40
     },
     {
@@ -128,24 +128,24 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
-      "completions/max_length": 344.4,
-      "completions/max_terminated_length": 344.4,
-      "completions/mean_length": 133.3541702270508,
-      "completions/mean_terminated_length": 133.3541702270508,
-      "completions/min_length": 68.4,
-      "completions/min_terminated_length": 68.4,
       "epoch": 0.29985007496251875,
       "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.55859375,
-      "kl": 0.16903076171875,
       "learning_rate": 1.5999661014486956e-05,
-      "loss": 0.0222,
-      "num_tokens": 1090352.0,
-      "reward": 1.477199375629425,
-      "reward_std": 0.12433719113469124,
-      "rewards/_accuracy_reward/mean": 0.4771993726491928,
-      "rewards/_accuracy_reward/std": 0.14259819611907004,
-      "rewards/_format_reward/mean": 1.0,
       "rewards/_format_reward/std": 0.0,
       "step": 50
     },
@@ -155,26 +155,26 @@
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
-      "completions/clipped_ratio": 0.0,
-      "completions/max_length": 441.4,
-      "completions/max_terminated_length": 441.4,
-      "completions/mean_length": 162.48541870117188,
-      "completions/mean_terminated_length": 162.48541870117188,
-      "completions/min_length": 73.3,
-      "completions/min_terminated_length": 73.3,
       "epoch": 0.3598200899550225,
       "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.51953125,
-      "kl": 0.134783935546875,
       "learning_rate": 1.4387491059717653e-05,
-      "loss": 0.034,
-      "num_tokens": 1286305.0,
-      "reward": 1.4927098155021667,
-      "reward_std": 0.1404846042394638,
-      "rewards/_accuracy_reward/mean": 0.494793102145195,
-      "rewards/_accuracy_reward/std": 0.15004281625151633,
-      "rewards/_format_reward/mean": 0.9979166686534882,
-      "rewards/_format_reward/std": 0.014433756470680237,
       "step": 60
     },
     {
@@ -183,26 +183,26 @@
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
-      "completions/clipped_ratio": 0.002083333333333337,
-      "completions/max_length": 528.9,
-      "completions/max_terminated_length": 504.8,
-      "completions/mean_length": 158.65208740234374,
-      "completions/mean_terminated_length": 156.81835327148437,
-      "completions/min_length": 67.8,
-      "completions/min_terminated_length": 67.8,
       "epoch": 0.4197901049475262,
       "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.49609375,
-      "kl": 0.13594970703125,
       "learning_rate": 1.2618644849608068e-05,
-      "loss": 0.0934,
-      "num_tokens": 1480586.0,
-      "reward": 1.4793070673942565,
-      "reward_std": 0.14201814979314803,
-      "rewards/_accuracy_reward/mean": 0.4897237092256546,
-      "rewards/_accuracy_reward/std": 0.14367412701249122,
-      "rewards/_format_reward/mean": 0.9895833373069763,
-      "rewards/_format_reward/std": 0.06349536329507828,
       "step": 70
     },
     {
@@ -212,25 +212,25 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
-      "completions/max_length": 451.0,
-      "completions/max_terminated_length": 451.0,
-      "completions/mean_length": 140.0562545776367,
-      "completions/mean_terminated_length": 140.0562545776367,
-      "completions/min_length": 66.8,
-      "completions/min_terminated_length": 66.8,
       "epoch": 0.47976011994003,
       "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.494140625,
-      "kl": 0.13839111328125,
       "learning_rate": 1.075628745884457e-05,
-      "loss": 0.0383,
-      "num_tokens": 1665965.0,
-      "reward": 1.5421025276184082,
-      "reward_std": 0.15527970492839813,
-      "rewards/_accuracy_reward/mean": 0.550435796380043,
-      "rewards/_accuracy_reward/std": 0.14859750047326087,
-      "rewards/_format_reward/mean": 0.9916666746139526,
-      "rewards/_format_reward/std": 0.05773502588272095,
       "step": 80
     },
     {
@@ -240,25 +240,25 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
-      "completions/max_length": 357.0,
-      "completions/max_terminated_length": 357.0,
-      "completions/mean_length": 139.4875045776367,
-      "completions/mean_terminated_length": 139.4875045776367,
-      "completions/min_length": 65.1,
-      "completions/min_terminated_length": 65.1,
       "epoch": 0.5397301349325337,
       "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.458984375,
-      "kl": 0.109674072265625,
       "learning_rate": 8.866923223987303e-06,
-      "loss": 0.0515,
-      "num_tokens": 1850903.0,
-      "reward": 1.4895017743110657,
-      "reward_std": 0.1271946720778942,
-      "rewards/_accuracy_reward/mean": 0.49366843402385713,
-      "rewards/_accuracy_reward/std": 0.14131565093994142,
-      "rewards/_format_reward/mean": 0.9958333373069763,
-      "rewards/_format_reward/std": 0.028867512941360474,
       "step": 90
     },
     {
@@ -268,25 +268,25 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
-      "completions/max_length": 409.8,
-      "completions/max_terminated_length": 409.8,
-      "completions/mean_length": 147.96250610351564,
-      "completions/mean_terminated_length": 147.96250610351564,
-      "completions/min_length": 73.5,
-      "completions/min_terminated_length": 73.5,
       "epoch": 0.5997001499250375,
       "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.515625,
-      "kl": 0.10118408203125,
       "learning_rate": 7.018020889533348e-06,
-      "loss": 0.0394,
-      "num_tokens": 2039933.0,
-      "reward": 1.497954213619232,
-      "reward_std": 0.1677745535969734,
-      "rewards/_accuracy_reward/mean": 0.5062875241041184,
-      "rewards/_accuracy_reward/std": 0.1678355909883976,
-      "rewards/_format_reward/mean": 0.9916666746139526,
-      "rewards/_format_reward/std": 0.05773502588272095,
       "step": 100
     },
     {
@@ -296,25 +296,25 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
-      "completions/max_length": 390.7,
-      "completions/max_terminated_length": 390.7,
-      "completions/mean_length": 137.69375457763672,
-      "completions/mean_terminated_length": 137.69375457763672,
-      "completions/min_length": 64.9,
-      "completions/min_terminated_length": 64.9,
       "epoch": 0.6596701649175413,
       "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.5234375,
-      "kl": 0.090032958984375,
       "learning_rate": 5.2756043152032934e-06,
-      "loss": 0.0155,
-      "num_tokens": 2224082.0,
-      "reward": 1.49248765707016,
-      "reward_std": 0.14827005118131636,
-      "rewards/_accuracy_reward/mean": 0.49873766899108884,
-      "rewards/_accuracy_reward/std": 0.16313461735844612,
-      "rewards/_format_reward/mean": 0.9937500059604645,
-      "rewards/_format_reward/std": 0.04330126941204071,
       "step": 110
     },
     {
@@ -323,26 +323,26 @@
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
-      "completions/clipped_ratio": 0.0,
-      "completions/max_length": 396.7,
-      "completions/max_terminated_length": 396.7,
-      "completions/mean_length": 147.02917022705077,
-      "completions/mean_terminated_length": 147.02917022705077,
-      "completions/min_length": 66.4,
-      "completions/min_terminated_length": 66.4,
       "epoch": 0.719640179910045,
-      "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.40234375,
-      "kl": 0.082232666015625,
       "learning_rate": 3.7018947797172864e-06,
-      "loss": 0.0188,
-      "num_tokens": 2412640.0,
-      "reward": 1.5127413272857666,
-      "reward_std": 0.1542006738483906,
-      "rewards/_accuracy_reward/mean": 0.5210746347904205,
-      "rewards/_accuracy_reward/std": 0.16292970031499862,
-      "rewards/_format_reward/mean": 0.9916666746139526,
-      "rewards/_format_reward/std": 0.05773502588272095,
       "step": 120
     },
     {
@@ -352,25 +352,25 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
-      "completions/max_length": 439.5,
-      "completions/max_terminated_length": 439.5,
-      "completions/mean_length": 151.95000457763672,
-      "completions/mean_terminated_length": 151.95000457763672,
-      "completions/min_length": 58.6,
-      "completions/min_terminated_length": 58.6,
       "epoch": 0.7796101949025487,
       "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.4765625,
-      "kl": 0.08944091796875,
       "learning_rate": 2.353089073828255e-06,
-      "loss": 0.0402,
-      "num_tokens": 2604064.0,
-      "reward": 1.5172916531562806,
-      "reward_std": 0.13390944600105287,
-      "rewards/_accuracy_reward/mean": 0.5214582800865173,
-      "rewards/_accuracy_reward/std": 0.14149208813905717,
-      "rewards/_format_reward/mean": 0.9958333373069763,
-      "rewards/_format_reward/std": 0.028867512941360474,
       "step": 130
     },
     {
@@ -379,26 +379,26 @@
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
-      "completions/clipped_ratio": 0.0,
-      "completions/max_length": 459.0,
-      "completions/max_terminated_length": 459.0,
-      "completions/mean_length": 143.82083587646486,
-      "completions/mean_terminated_length": 143.82083587646486,
-      "completions/min_length": 57.1,
-      "completions/min_terminated_length": 57.1,
       "epoch": 0.8395802098950524,
-      "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.486328125,
-      "kl": 0.09305419921875,
       "learning_rate": 1.2773527263780626e-06,
-      "loss": 0.066,
-      "num_tokens": 2791442.0,
-      "reward": 1.4838216304779053,
-      "reward_std": 0.1543775752186775,
-      "rewards/_accuracy_reward/mean": 0.4942382574081421,
-      "rewards/_accuracy_reward/std": 0.1497928135097027,
-      "rewards/_format_reward/mean": 0.9895833373069763,
-      "rewards/_format_reward/std": 0.06349536329507828,
       "step": 140
     },
     {
@@ -407,25 +407,25 @@
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
-      "completions/clipped_ratio": 0.0,
-      "completions/max_length": 428.9,
-      "completions/max_terminated_length": 428.9,
-      "completions/mean_length": 145.2979248046875,
-      "completions/mean_terminated_length": 145.2979248046875,
-      "completions/min_length": 57.3,
-      "completions/min_terminated_length": 57.3,
       "epoch": 0.8995502248875562,
       "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.484375,
-      "kl": 0.106024169921875,
       "learning_rate": 5.131000247938367e-07,
-      "loss": 0.0329,
-      "num_tokens": 2979625.0,
-      "reward": 1.534533643722534,
-      "reward_std": 0.1303482674062252,
-      "rewards/_accuracy_reward/mean": 0.5345336198806763,
-      "rewards/_accuracy_reward/std": 0.1466048091650009,
-      "rewards/_format_reward/mean": 1.0,
       "rewards/_format_reward/std": 0.0,
       "step": 150
     },
@@ -435,26 +435,26 @@
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
-      "completions/clipped_ratio": 0.0,
-      "completions/max_length": 383.6,
-      "completions/max_terminated_length": 383.6,
-      "completions/mean_length": 150.10000457763672,
-      "completions/mean_terminated_length": 150.10000457763672,
-      "completions/min_length": 58.6,
-      "completions/min_terminated_length": 58.6,
       "epoch": 0.95952023988006,
       "frac_reward_zero_std": 0.0,
-      "grad_norm": 2.046875,
-      "kl": 0.114117431640625,
       "learning_rate": 8.762225008062675e-08,
-      "loss": 0.0142,
-      "num_tokens": 3169825.0,
-      "reward": 1.4770540237426757,
-      "reward_std": 0.1398945815861225,
-      "rewards/_accuracy_reward/mean": 0.4812206119298935,
-      "rewards/_accuracy_reward/std": 0.1583762623369694,
-      "rewards/_format_reward/mean": 0.9958333373069763,
-      "rewards/_format_reward/std": 0.028867512941360474,
       "step": 160
     },
     {
@@ -464,33 +464,33 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
-      "completions/max_length": 433.5,
-      "completions/max_terminated_length": 433.5,
-      "completions/mean_length": 161.9201431274414,
-      "completions/mean_terminated_length": 161.9201431274414,
-      "completions/min_length": 59.166666666666664,
-      "completions/min_terminated_length": 59.166666666666664,
       "epoch": 0.9955022488755623,
       "frac_reward_zero_std": 0.0,
-      "kl": 0.0867919921875,
-      "num_tokens": 3287402.0,
-      "reward": 1.5356033047040303,
-      "reward_std": 0.14081149299939474,
-      "rewards/_accuracy_reward/mean": 0.5460198571284612,
-      "rewards/_accuracy_reward/std": 0.13319105903307596,
-      "rewards/_format_reward/mean": 0.9895833333333334,
-      "rewards/_format_reward/std": 0.05771308392286301,
       "step": 166,
       "total_flos": 0.0,
-      "train_loss": 0.04260323340275202,
-      "train_runtime": 3381.4576,
-      "train_samples_per_second": 0.296,
-      "train_steps_per_second": 0.049
     }
   ],
   "logging_steps": 10,
   "max_steps": 166,
-  "num_input_tokens_seen": 3287402,
   "num_train_epochs": 1,
   "save_steps": 500,
   "stateful_callbacks": {

       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.002083333333333337,
+      "completions/max_length": 414.3,
+      "completions/max_terminated_length": 390.7,
+      "completions/mean_length": 96.92292022705078,
+      "completions/mean_terminated_length": 95.08196258544922,
+      "completions/min_length": 43.4,
+      "completions/min_terminated_length": 43.4,
       "epoch": 0.05997001499250375,
+      "frac_reward_zero_std": 0.10000000149011612,
+      "grad_norm": 0.875,
+      "kl": 0.04828977584838867,
       "learning_rate": 1.9855293386108995e-05,
+      "loss": -0.0894,
+      "num_tokens": 164459.0,
+      "reward": 0.25649446398019793,
+      "reward_std": 0.12709882631897926,
+      "rewards/_accuracy_reward/mean": 0.25024444460868833,
+      "rewards/_accuracy_reward/std": 0.16934245973825454,
+      "rewards/_format_reward/mean": 0.00625,
+      "rewards/_format_reward/std": 0.02446230351924896,
       "step": 10
     },
     {
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
+      "completions/max_length": 405.9,
+      "completions/max_terminated_length": 405.9,
+      "completions/mean_length": 115.59583740234375,
+      "completions/mean_terminated_length": 115.59583740234375,
+      "completions/min_length": 65.6,
+      "completions/min_terminated_length": 65.6,
       "epoch": 0.1199400299850075,
       "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.8359375,
+      "kl": 0.0818756103515625,
       "learning_rate": 1.936044737814273e-05,
+      "loss": 0.0581,
+      "num_tokens": 338097.0,
+      "reward": 0.36916170418262484,
+      "reward_std": 0.13012803941965104,
+      "rewards/_accuracy_reward/mean": 0.36916169822216033,
+      "rewards/_accuracy_reward/std": 0.1557157054543495,
+      "rewards/_format_reward/mean": 0.0,
+      "rewards/_format_reward/std": 0.0,
       "step": 20
     },
     {
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
+      "completions/max_length": 269.2,
+      "completions/max_terminated_length": 269.2,
+      "completions/mean_length": 85.4083351135254,
+      "completions/mean_terminated_length": 85.4083351135254,
+      "completions/min_length": 57.7,
+      "completions/min_terminated_length": 57.7,
       "epoch": 0.17991004497751126,
       "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.6484375,
+      "kl": 0.102984619140625,
       "learning_rate": 1.8531342035272768e-05,
+      "loss": 0.0225,
+      "num_tokens": 497557.0,
+      "reward": 0.4350260511040688,
+      "reward_std": 0.14344265162944794,
+      "rewards/_accuracy_reward/mean": 0.43502604514360427,
+      "rewards/_accuracy_reward/std": 0.1680240161716938,
+      "rewards/_format_reward/mean": 0.0,
+      "rewards/_format_reward/std": 0.0,
       "step": 30
     },
     {
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
+      "completions/max_length": 198.7,
+      "completions/max_terminated_length": 198.7,
+      "completions/mean_length": 76.06458435058593,
+      "completions/mean_terminated_length": 76.06458435058593,
+      "completions/min_length": 44.7,
+      "completions/min_terminated_length": 44.7,
       "epoch": 0.239880059970015,
       "frac_reward_zero_std": 0.0,
       "grad_norm": 0.63671875,
+      "kl": 0.11517333984375,
       "learning_rate": 1.7397584510798208e-05,
+      "loss": 0.0064,
+      "num_tokens": 652196.0,
+      "reward": 0.4085416719317436,
+      "reward_std": 0.13231892064213752,
+      "rewards/_accuracy_reward/mean": 0.4085416689515114,
+      "rewards/_accuracy_reward/std": 0.1608368895947933,
+      "rewards/_format_reward/mean": 0.0,
+      "rewards/_format_reward/std": 0.0,
       "step": 40
     },
     {
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
+      "completions/max_length": 265.6,
+      "completions/max_terminated_length": 265.6,
+      "completions/mean_length": 89.46666946411133,
+      "completions/mean_terminated_length": 89.46666946411133,
+      "completions/min_length": 62.4,
+      "completions/min_terminated_length": 62.4,
       "epoch": 0.29985007496251875,
       "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.392578125,
+      "kl": 0.118402099609375,
       "learning_rate": 1.5999661014486956e-05,
+      "loss": 0.0506,
+      "num_tokens": 813532.0,
+      "reward": 0.5038281351327896,
+      "reward_std": 0.12968316152691842,
+      "rewards/_accuracy_reward/mean": 0.5038281202316284,
+      "rewards/_accuracy_reward/std": 0.16127740293741227,
+      "rewards/_format_reward/mean": 0.0,
       "rewards/_format_reward/std": 0.0,
       "step": 50
     },
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.002083333333333337,
+      "completions/max_length": 364.6,
+      "completions/max_terminated_length": 304.4,
+      "completions/mean_length": 93.76042022705079,
+      "completions/mean_terminated_length": 91.8341781616211,
+      "completions/min_length": 62.7,
+      "completions/min_terminated_length": 62.7,
       "epoch": 0.3598200899550225,
       "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.455078125,
+      "kl": 0.288519287109375,
       "learning_rate": 1.4387491059717653e-05,
+      "loss": 0.0338,
+      "num_tokens": 976497.0,
+      "reward": 0.4925865650177002,
+      "reward_std": 0.12617484703660012,
+      "rewards/_accuracy_reward/mean": 0.492586562037468,
+      "rewards/_accuracy_reward/std": 0.15444535091519357,
+      "rewards/_format_reward/mean": 0.0,
+      "rewards/_format_reward/std": 0.0,
       "step": 60
     },
     {
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 265.3,
+      "completions/max_terminated_length": 265.3,
+      "completions/mean_length": 85.07291946411132,
+      "completions/mean_terminated_length": 85.07291946411132,
+      "completions/min_length": 70.0,
+      "completions/min_terminated_length": 70.0,
       "epoch": 0.4197901049475262,
       "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.451171875,
+      "kl": 0.089068603515625,
       "learning_rate": 1.2618644849608068e-05,
+      "loss": 0.0247,
+      "num_tokens": 1135460.0,
+      "reward": 0.4750963538885117,
+      "reward_std": 0.12951767966151237,
+      "rewards/_accuracy_reward/mean": 0.4750963240861893,
+      "rewards/_accuracy_reward/std": 0.1608477719128132,
+      "rewards/_format_reward/mean": 0.0,
+      "rewards/_format_reward/std": 0.0,
       "step": 70
     },
     {
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
+      "completions/max_length": 284.3,
+      "completions/max_terminated_length": 284.3,
+      "completions/mean_length": 87.73958587646484,
+      "completions/mean_terminated_length": 87.73958587646484,
+      "completions/min_length": 69.4,
+      "completions/min_terminated_length": 69.4,
       "epoch": 0.47976011994003,
       "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.4140625,
+      "kl": 0.07869873046875,
       "learning_rate": 1.075628745884457e-05,
+      "loss": 0.0442,
+      "num_tokens": 1295727.0,
+      "reward": 0.5100168704986572,
+      "reward_std": 0.15252956375479698,
+      "rewards/_accuracy_reward/mean": 0.5100168436765671,
+      "rewards/_accuracy_reward/std": 0.17259212732315063,
+      "rewards/_format_reward/mean": 0.0,
+      "rewards/_format_reward/std": 0.0,
       "step": 80
     },
     {
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
+      "completions/max_length": 259.8,
+      "completions/max_terminated_length": 259.8,
+      "completions/mean_length": 88.41250228881836,
+      "completions/mean_terminated_length": 88.41250228881836,
+      "completions/min_length": 64.1,
+      "completions/min_terminated_length": 64.1,
       "epoch": 0.5397301349325337,
       "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.66796875,
+      "kl": 0.17242431640625,
       "learning_rate": 8.866923223987303e-06,
+      "loss": 0.027,
+      "num_tokens": 1456149.0,
+      "reward": 0.4691749334335327,
+      "reward_std": 0.11657274290919303,
+      "rewards/_accuracy_reward/mean": 0.4691749155521393,
+      "rewards/_accuracy_reward/std": 0.13906535133719444,
+      "rewards/_format_reward/mean": 0.0,
+      "rewards/_format_reward/std": 0.0,
       "step": 90
     },
     {
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
+      "completions/max_length": 338.8,
+      "completions/max_terminated_length": 338.8,
+      "completions/mean_length": 94.3062530517578,
+      "completions/mean_terminated_length": 94.3062530517578,
+      "completions/min_length": 59.2,
+      "completions/min_terminated_length": 59.2,
       "epoch": 0.5997001499250375,
       "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.59375,
+      "kl": 0.106280517578125,
       "learning_rate": 7.018020889533348e-06,
+      "loss": -0.0246,
+      "num_tokens": 1619424.0,
+      "reward": 0.5008379817008972,
+      "reward_std": 0.1251222789287567,
+      "rewards/_accuracy_reward/mean": 0.5008379787206649,
+      "rewards/_accuracy_reward/std": 0.14559592306613922,
+      "rewards/_format_reward/mean": 0.0,
+      "rewards/_format_reward/std": 0.0,
       "step": 100
     },
     {
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
+      "completions/max_length": 404.7,
+      "completions/max_terminated_length": 404.7,
+      "completions/mean_length": 115.0875030517578,
+      "completions/mean_terminated_length": 115.0875030517578,
+      "completions/min_length": 51.4,
+      "completions/min_terminated_length": 51.4,
       "epoch": 0.6596701649175413,
       "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.353515625,
+      "kl": 0.0727294921875,
       "learning_rate": 5.2756043152032934e-06,
+      "loss": -0.0641,
+      "num_tokens": 1792722.0,
+      "reward": 0.4739863067865372,
+      "reward_std": 0.13204658553004264,
+      "rewards/_accuracy_reward/mean": 0.47398627996444703,
+      "rewards/_accuracy_reward/std": 0.15842494517564773,
+      "rewards/_format_reward/mean": 0.0,
+      "rewards/_format_reward/std": 0.0,
       "step": 110
     },
     {
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.002083333333333337,
+      "completions/max_length": 517.4,
+      "completions/max_terminated_length": 435.3,
+      "completions/mean_length": 136.24375381469727,
+      "completions/mean_terminated_length": 134.30962219238282,
+      "completions/min_length": 51.6,
+      "completions/min_terminated_length": 51.6,
       "epoch": 0.719640179910045,
+      "frac_reward_zero_std": 0.01666666716337204,
+      "grad_norm": 0.279296875,
+      "kl": 0.0670654296875,
       "learning_rate": 3.7018947797172864e-06,
+      "loss": -0.0695,
+      "num_tokens": 1976103.0,
+      "reward": 0.5253316760063171,
+      "reward_std": 0.12880267389118671,
+      "rewards/_accuracy_reward/mean": 0.5253316521644592,
+      "rewards/_accuracy_reward/std": 0.17076537311077117,
+      "rewards/_format_reward/mean": 0.0,
+      "rewards/_format_reward/std": 0.0,
       "step": 120
     },
     {
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
+      "completions/max_length": 522.1,
+      "completions/max_terminated_length": 522.1,
+      "completions/mean_length": 152.3375045776367,
+      "completions/mean_terminated_length": 152.3375045776367,
+      "completions/min_length": 46.7,
+      "completions/min_terminated_length": 46.7,
       "epoch": 0.7796101949025487,
       "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.298828125,
+      "kl": 0.063446044921875,
       "learning_rate": 2.353089073828255e-06,
+      "loss": -0.0654,
+      "num_tokens": 2167713.0,
+      "reward": 0.48647034764289854,
+      "reward_std": 0.1461639277637005,
+      "rewards/_accuracy_reward/mean": 0.48647033274173734,
+      "rewards/_accuracy_reward/std": 0.17206955328583717,
+      "rewards/_format_reward/mean": 0.0,
+      "rewards/_format_reward/std": 0.0,
       "step": 130
     },
     {
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.002083333333333337,
+      "completions/max_length": 562.7,
+      "completions/max_terminated_length": 500.5,
+      "completions/mean_length": 158.33542022705078,
+      "completions/mean_terminated_length": 156.5459243774414,
+      "completions/min_length": 50.7,
+      "completions/min_terminated_length": 50.7,
       "epoch": 0.8395802098950524,
+      "frac_reward_zero_std": 0.03333333432674408,
+      "grad_norm": 0.5,
+      "kl": 0.06429443359375,
       "learning_rate": 1.2773527263780626e-06,
+      "loss": -0.0735,
+      "num_tokens": 2362058.0,
+      "reward": 0.4694186806678772,
+      "reward_std": 0.13372117429971694,
+      "rewards/_accuracy_reward/mean": 0.46941866278648375,
+      "rewards/_accuracy_reward/std": 0.1658677004277706,
+      "rewards/_format_reward/mean": 0.0,
+      "rewards/_format_reward/std": 0.0,
       "step": 140
     },
     {
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.004166666666666674,
+      "completions/max_length": 636.7,
+      "completions/max_terminated_length": 532.8,
+      "completions/mean_length": 168.87708892822266,
+      "completions/mean_terminated_length": 165.28546447753905,
+      "completions/min_length": 42.2,
+      "completions/min_terminated_length": 42.2,
       "epoch": 0.8995502248875562,
       "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.232421875,
+      "kl": 0.085772705078125,
       "learning_rate": 5.131000247938367e-07,
+      "loss": -0.074,
+      "num_tokens": 2561559.0,
+      "reward": 0.5166842222213746,
+      "reward_std": 0.15563009977340697,
+      "rewards/_accuracy_reward/mean": 0.5166841924190522,
+      "rewards/_accuracy_reward/std": 0.1897404298186302,
+      "rewards/_format_reward/mean": 0.0,
       "rewards/_format_reward/std": 0.0,
       "step": 150
     },
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.004166666666666674,
+      "completions/max_length": 578.3,
+      "completions/max_terminated_length": 469.4,
+      "completions/mean_length": 176.32083740234376,
+      "completions/mean_terminated_length": 172.7707046508789,
+      "completions/min_length": 57.6,
+      "completions/min_terminated_length": 57.6,
       "epoch": 0.95952023988006,
       "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.310546875,
+      "kl": 0.0620758056640625,
       "learning_rate": 8.762225008062675e-08,
+      "loss": -0.0572,
+      "num_tokens": 2764345.0,
+      "reward": 0.4916923582553864,
+      "reward_std": 0.14141732677817345,
+      "rewards/_accuracy_reward/mean": 0.49169233739376067,
+      "rewards/_accuracy_reward/std": 0.17603871822357178,
+      "rewards/_format_reward/mean": 0.0,
+      "rewards/_format_reward/std": 0.0,
       "step": 160
     },
     {
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
+      "completions/max_length": 447.6666666666667,
+      "completions/max_terminated_length": 447.6666666666667,
+      "completions/mean_length": 160.1840337117513,
+      "completions/mean_terminated_length": 160.1840337117513,
+      "completions/min_length": 49.333333333333336,
+      "completions/min_terminated_length": 49.333333333333336,
       "epoch": 0.9955022488755623,
       "frac_reward_zero_std": 0.0,
+      "kl": 0.069488525390625,
+      "num_tokens": 2881422.0,
+      "reward": 0.49811801811059314,
+      "reward_std": 0.12225283433993657,
+      "rewards/_accuracy_reward/mean": 0.4981180081764857,
+      "rewards/_accuracy_reward/std": 0.14967897906899452,
+      "rewards/_format_reward/mean": 0.0,
+      "rewards/_format_reward/std": 0.0,
       "step": 166,
       "total_flos": 0.0,
+      "train_loss": -0.017725162387612355,
+      "train_runtime": 3869.3247,
+      "train_samples_per_second": 0.258,
+      "train_steps_per_second": 0.043
     }
   ],
   "logging_steps": 10,
   "max_steps": 166,
+  "num_input_tokens_seen": 2881422,
   "num_train_epochs": 1,
   "save_steps": 500,
   "stateful_callbacks": {