YangZhoumill
/

Qwen2.5-1.5B-Open-R1-Distill

@@ -1,11 +1,9 @@
 ---
 base_model: Qwen/Qwen2.5-0.5B-Instruct
-datasets: YangZhoumill/post_v_1
 library_name: transformers
 model_name: Qwen2.5-1.5B-Open-R1-Distill
 tags:
 - generated_from_trainer
-- open-r1
 - trl
 - sft
 licence: license
@@ -13,7 +11,7 @@ licence: license
 # Model Card for Qwen2.5-1.5B-Open-R1-Distill
-This model is a fine-tuned version of [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) on the [YangZhoumill/post_v_1](https://huggingface.co/datasets/YangZhoumill/post_v_1) dataset.
 It has been trained using [TRL](https://github.com/huggingface/trl).
 ## Quick start
@@ -22,14 +20,14 @@ It has been trained using [TRL](https://github.com/huggingface/trl).
 from transformers import pipeline
 question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
-generator = pipeline("text-generation", model="ZMC2019/Qwen2.5-1.5B-Open-R1-Distill", device="cuda")
 output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
 print(output["generated_text"])
 ```
 ## Training procedure
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/stevenzhou0816100/huggingface/runs/uy7nfosz)
 This model was trained with SFT.

 ---
 base_model: Qwen/Qwen2.5-0.5B-Instruct
 library_name: transformers
 model_name: Qwen2.5-1.5B-Open-R1-Distill
 tags:
 - generated_from_trainer
 - trl
 - sft
 licence: license
 # Model Card for Qwen2.5-1.5B-Open-R1-Distill
+This model is a fine-tuned version of [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct).
 It has been trained using [TRL](https://github.com/huggingface/trl).
 ## Quick start
 from transformers import pipeline
 question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="YangZhoumill/Qwen2.5-1.5B-Open-R1-Distill", device="cuda")
 output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
 print(output["generated_text"])
 ```
 ## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/stevenzhou0816100/huggingface/runs/edxcy052)
 This model was trained with SFT.

all_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
-    "total_flos": 1.355802795311104e+16,
-    "train_loss": 0.44515162458022434,
-    "train_runtime": 298.6439,
-    "train_samples": 8221,
-    "train_samples_per_second": 0.161,
-    "train_steps_per_second": 0.161
 }

 {
+    "total_flos": 4.404961752383488e+16,
+    "train_loss": 0.28405343046555154,
+    "train_runtime": 1315.0218,
+    "train_samples": 10016,
+    "train_samples_per_second": 0.989,
+    "train_steps_per_second": 0.989
 }

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
-    "total_flos": 1.355802795311104e+16,
-    "train_loss": 0.44515162458022434,
-    "train_runtime": 298.6439,
-    "train_samples": 8221,
-    "train_samples_per_second": 0.161,
-    "train_steps_per_second": 0.161
 }

 {
+    "total_flos": 4.404961752383488e+16,
+    "train_loss": 0.28405343046555154,
+    "train_runtime": 1315.0218,
+    "train_samples": 10016,
+    "train_samples_per_second": 0.989,
+    "train_steps_per_second": 0.989
 }

trainer_state.json CHANGED Viewed

@@ -3,86 +3,1843 @@
   "best_model_checkpoint": null,
   "epoch": 1.0,
   "eval_steps": 500,
-  "global_step": 48,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.10416666666666667,
-      "grad_norm": 10.75,
-      "learning_rate": 4.978103154668534e-05,
-      "loss": 0.9914,
       "step": 5
     },
     {
-      "epoch": 0.20833333333333334,
-      "grad_norm": 4.28125,
-      "learning_rate": 4.7366320839325864e-05,
-      "loss": 0.55,
       "step": 10
     },
     {
-      "epoch": 0.3125,
-      "grad_norm": 1.984375,
-      "learning_rate": 4.255543864307431e-05,
-      "loss": 0.4382,
       "step": 15
     },
     {
-      "epoch": 0.4166666666666667,
-      "grad_norm": 1.2578125,
-      "learning_rate": 3.5928648351858027e-05,
-      "loss": 0.3833,
       "step": 20
     },
     {
-      "epoch": 0.5208333333333334,
-      "grad_norm": 1.3515625,
-      "learning_rate": 2.828523867580628e-05,
-      "loss": 0.3536,
       "step": 25
     },
     {
-      "epoch": 0.625,
-      "grad_norm": 1.2734375,
-      "learning_rate": 2.054711762656369e-05,
-      "loss": 0.3548,
       "step": 30
     },
     {
-      "epoch": 0.7291666666666666,
-      "grad_norm": 1.1171875,
-      "learning_rate": 1.364761680517269e-05,
-      "loss": 0.3445,
       "step": 35
     },
     {
-      "epoch": 0.8333333333333334,
-      "grad_norm": 1.0078125,
-      "learning_rate": 8.418917836480417e-06,
-      "loss": 0.3189,
       "step": 40
     },
     {
-      "epoch": 0.9375,
-      "grad_norm": 0.8203125,
-      "learning_rate": 5.491678983489372e-06,
-      "loss": 0.3172,
       "step": 45
     },
     {
       "epoch": 1.0,
-      "step": 48,
-      "total_flos": 1.355802795311104e+16,
-      "train_loss": 0.44515162458022434,
-      "train_runtime": 298.6439,
-      "train_samples_per_second": 0.161,
-      "train_steps_per_second": 0.161
     }
   ],
   "logging_steps": 5,
-  "max_steps": 48,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 100,
@@ -98,7 +1855,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.355802795311104e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

   "best_model_checkpoint": null,
   "epoch": 1.0,
   "eval_steps": 500,
+  "global_step": 1300,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.0038461538461538464,
+      "grad_norm": 68.5,
+      "learning_rate": 3.846153846153847e-06,
+      "loss": 1.442,
       "step": 5
     },
     {
+      "epoch": 0.007692307692307693,
+      "grad_norm": 28.875,
+      "learning_rate": 7.692307692307694e-06,
+      "loss": 1.2205,
       "step": 10
     },
     {
+      "epoch": 0.011538461538461539,
+      "grad_norm": 14.6875,
+      "learning_rate": 1.153846153846154e-05,
+      "loss": 0.9428,
       "step": 15
     },
     {
+      "epoch": 0.015384615384615385,
+      "grad_norm": 9.375,
+      "learning_rate": 1.5384615384615387e-05,
+      "loss": 0.7695,
       "step": 20
     },
     {
+      "epoch": 0.019230769230769232,
+      "grad_norm": 14528.0,
+      "learning_rate": 1.923076923076923e-05,
+      "loss": 0.7043,
       "step": 25
     },
     {
+      "epoch": 0.023076923076923078,
+      "grad_norm": 4.21875,
+      "learning_rate": 2.307692307692308e-05,
+      "loss": 0.6085,
       "step": 30
     },
     {
+      "epoch": 0.026923076923076925,
+      "grad_norm": 7.40625,
+      "learning_rate": 2.6923076923076923e-05,
+      "loss": 0.575,
       "step": 35
     },
     {
+      "epoch": 0.03076923076923077,
+      "grad_norm": 5.40625,
+      "learning_rate": 3.0769230769230774e-05,
+      "loss": 0.5192,
       "step": 40
     },
     {
+      "epoch": 0.03461538461538462,
+      "grad_norm": 5.53125,
+      "learning_rate": 3.461538461538462e-05,
+      "loss": 0.5032,
       "step": 45
     },
+    {
+      "epoch": 0.038461538461538464,
+      "grad_norm": 6.09375,
+      "learning_rate": 3.846153846153846e-05,
+      "loss": 0.4601,
+      "step": 50
+    },
+    {
+      "epoch": 0.04230769230769231,
+      "grad_norm": 3.9375,
+      "learning_rate": 4.230769230769231e-05,
+      "loss": 0.4175,
+      "step": 55
+    },
+    {
+      "epoch": 0.046153846153846156,
+      "grad_norm": 4.8125,
+      "learning_rate": 4.615384615384616e-05,
+      "loss": 0.4093,
+      "step": 60
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 4.9375,
+      "learning_rate": 5e-05,
+      "loss": 0.387,
+      "step": 65
+    },
+    {
+      "epoch": 0.05384615384615385,
+      "grad_norm": 4.78125,
+      "learning_rate": 4.99981800791248e-05,
+      "loss": 0.4415,
+      "step": 70
+    },
+    {
+      "epoch": 0.057692307692307696,
+      "grad_norm": 5.0,
+      "learning_rate": 4.9992720610909145e-05,
+      "loss": 0.4325,
+      "step": 75
+    },
+    {
+      "epoch": 0.06153846153846154,
+      "grad_norm": 4.15625,
+      "learning_rate": 4.998362247853528e-05,
+      "loss": 0.3797,
+      "step": 80
+    },
+    {
+      "epoch": 0.06538461538461539,
+      "grad_norm": 5.28125,
+      "learning_rate": 4.997088715381485e-05,
+      "loss": 0.3574,
+      "step": 85
+    },
+    {
+      "epoch": 0.06923076923076923,
+      "grad_norm": 3.78125,
+      "learning_rate": 4.995451669695081e-05,
+      "loss": 0.3803,
+      "step": 90
+    },
+    {
+      "epoch": 0.07307692307692308,
+      "grad_norm": 3.15625,
+      "learning_rate": 4.993451375620418e-05,
+      "loss": 0.3751,
+      "step": 95
+    },
+    {
+      "epoch": 0.07692307692307693,
+      "grad_norm": 2.640625,
+      "learning_rate": 4.991088156746554e-05,
+      "loss": 0.3477,
+      "step": 100
+    },
+    {
+      "epoch": 0.08076923076923077,
+      "grad_norm": 2.4375,
+      "learning_rate": 4.988362395373167e-05,
+      "loss": 0.2947,
+      "step": 105
+    },
+    {
+      "epoch": 0.08461538461538462,
+      "grad_norm": 4.21875,
+      "learning_rate": 4.985274532448704e-05,
+      "loss": 0.3417,
+      "step": 110
+    },
+    {
+      "epoch": 0.08846153846153847,
+      "grad_norm": 6.28125,
+      "learning_rate": 4.981825067499049e-05,
+      "loss": 0.3585,
+      "step": 115
+    },
+    {
+      "epoch": 0.09230769230769231,
+      "grad_norm": 2.921875,
+      "learning_rate": 4.978014558546715e-05,
+      "loss": 0.3465,
+      "step": 120
+    },
+    {
+      "epoch": 0.09615384615384616,
+      "grad_norm": 3.25,
+      "learning_rate": 4.973843622020571e-05,
+      "loss": 0.2972,
+      "step": 125
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 4.65625,
+      "learning_rate": 4.9693129326561254e-05,
+      "loss": 0.3853,
+      "step": 130
+    },
+    {
+      "epoch": 0.10384615384615385,
+      "grad_norm": 4.21875,
+      "learning_rate": 4.9644232233863685e-05,
+      "loss": 0.3619,
+      "step": 135
+    },
+    {
+      "epoch": 0.1076923076923077,
+      "grad_norm": 3.921875,
+      "learning_rate": 4.95917528522321e-05,
+      "loss": 0.3283,
+      "step": 140
+    },
+    {
+      "epoch": 0.11153846153846154,
+      "grad_norm": 2.703125,
+      "learning_rate": 4.953569967129513e-05,
+      "loss": 0.368,
+      "step": 145
+    },
+    {
+      "epoch": 0.11538461538461539,
+      "grad_norm": 3.375,
+      "learning_rate": 4.947608175881757e-05,
+      "loss": 0.3791,
+      "step": 150
+    },
+    {
+      "epoch": 0.11923076923076924,
+      "grad_norm": 3.65625,
+      "learning_rate": 4.9412908759233535e-05,
+      "loss": 0.3163,
+      "step": 155
+    },
+    {
+      "epoch": 0.12307692307692308,
+      "grad_norm": 5.625,
+      "learning_rate": 4.9346190892086174e-05,
+      "loss": 0.3239,
+      "step": 160
+    },
+    {
+      "epoch": 0.12692307692307692,
+      "grad_norm": 3.484375,
+      "learning_rate": 4.927593895037453e-05,
+      "loss": 0.2991,
+      "step": 165
+    },
+    {
+      "epoch": 0.13076923076923078,
+      "grad_norm": 3.421875,
+      "learning_rate": 4.920216429880752e-05,
+      "loss": 0.3416,
+      "step": 170
+    },
+    {
+      "epoch": 0.1346153846153846,
+      "grad_norm": 2.546875,
+      "learning_rate": 4.912487887196544e-05,
+      "loss": 0.3039,
+      "step": 175
+    },
+    {
+      "epoch": 0.13846153846153847,
+      "grad_norm": 3.25,
+      "learning_rate": 4.904409517236934e-05,
+      "loss": 0.2838,
+      "step": 180
+    },
+    {
+      "epoch": 0.1423076923076923,
+      "grad_norm": 3.0625,
+      "learning_rate": 4.895982626845843e-05,
+      "loss": 0.3172,
+      "step": 185
+    },
+    {
+      "epoch": 0.14615384615384616,
+      "grad_norm": 2.546875,
+      "learning_rate": 4.887208579247604e-05,
+      "loss": 0.2793,
+      "step": 190
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 3.8125,
+      "learning_rate": 4.878088793826428e-05,
+      "loss": 0.3016,
+      "step": 195
+    },
+    {
+      "epoch": 0.15384615384615385,
+      "grad_norm": 2.984375,
+      "learning_rate": 4.868624745896793e-05,
+      "loss": 0.2968,
+      "step": 200
+    },
+    {
+      "epoch": 0.1576923076923077,
+      "grad_norm": 3.75,
+      "learning_rate": 4.858817966464777e-05,
+      "loss": 0.295,
+      "step": 205
+    },
+    {
+      "epoch": 0.16153846153846155,
+      "grad_norm": 2.0625,
+      "learning_rate": 4.848670041980392e-05,
+      "loss": 0.3002,
+      "step": 210
+    },
+    {
+      "epoch": 0.16538461538461538,
+      "grad_norm": 2.671875,
+      "learning_rate": 4.838182614080934e-05,
+      "loss": 0.3005,
+      "step": 215
+    },
+    {
+      "epoch": 0.16923076923076924,
+      "grad_norm": 2.21875,
+      "learning_rate": 4.827357379325422e-05,
+      "loss": 0.2546,
+      "step": 220
+    },
+    {
+      "epoch": 0.17307692307692307,
+      "grad_norm": 2.140625,
+      "learning_rate": 4.816196088920143e-05,
+      "loss": 0.3147,
+      "step": 225
+    },
+    {
+      "epoch": 0.17692307692307693,
+      "grad_norm": 2.34375,
+      "learning_rate": 4.804700548435353e-05,
+      "loss": 0.276,
+      "step": 230
+    },
+    {
+      "epoch": 0.18076923076923077,
+      "grad_norm": 2.828125,
+      "learning_rate": 4.7928726175131955e-05,
+      "loss": 0.3404,
+      "step": 235
+    },
+    {
+      "epoch": 0.18461538461538463,
+      "grad_norm": 3.65625,
+      "learning_rate": 4.780714209566861e-05,
+      "loss": 0.3394,
+      "step": 240
+    },
+    {
+      "epoch": 0.18846153846153846,
+      "grad_norm": 2.3125,
+      "learning_rate": 4.768227291471055e-05,
+      "loss": 0.2997,
+      "step": 245
+    },
+    {
+      "epoch": 0.19230769230769232,
+      "grad_norm": 2.765625,
+      "learning_rate": 4.755413883243812e-05,
+      "loss": 0.2843,
+      "step": 250
+    },
+    {
+      "epoch": 0.19615384615384615,
+      "grad_norm": 2.828125,
+      "learning_rate": 4.7422760577197226e-05,
+      "loss": 0.2963,
+      "step": 255
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.765625,
+      "learning_rate": 4.7288159402146e-05,
+      "loss": 0.2854,
+      "step": 260
+    },
+    {
+      "epoch": 0.20384615384615384,
+      "grad_norm": 2.859375,
+      "learning_rate": 4.715035708181676e-05,
+      "loss": 0.3067,
+      "step": 265
+    },
+    {
+      "epoch": 0.2076923076923077,
+      "grad_norm": 2.6875,
+      "learning_rate": 4.7009375908593434e-05,
+      "loss": 0.2886,
+      "step": 270
+    },
+    {
+      "epoch": 0.21153846153846154,
+      "grad_norm": 2.765625,
+      "learning_rate": 4.6865238689105386e-05,
+      "loss": 0.3067,
+      "step": 275
+    },
+    {
+      "epoch": 0.2153846153846154,
+      "grad_norm": 3.171875,
+      "learning_rate": 4.67179687405379e-05,
+      "loss": 0.3321,
+      "step": 280
+    },
+    {
+      "epoch": 0.21923076923076923,
+      "grad_norm": 2.515625,
+      "learning_rate": 4.65675898868602e-05,
+      "loss": 0.283,
+      "step": 285
+    },
+    {
+      "epoch": 0.2230769230769231,
+      "grad_norm": 1.8203125,
+      "learning_rate": 4.6414126454971406e-05,
+      "loss": 0.2626,
+      "step": 290
+    },
+    {
+      "epoch": 0.22692307692307692,
+      "grad_norm": 2.171875,
+      "learning_rate": 4.625760327076513e-05,
+      "loss": 0.2789,
+      "step": 295
+    },
+    {
+      "epoch": 0.23076923076923078,
+      "grad_norm": 2.65625,
+      "learning_rate": 4.6098045655113416e-05,
+      "loss": 0.3019,
+      "step": 300
+    },
+    {
+      "epoch": 0.23461538461538461,
+      "grad_norm": 1.765625,
+      "learning_rate": 4.5935479419770525e-05,
+      "loss": 0.2696,
+      "step": 305
+    },
+    {
+      "epoch": 0.23846153846153847,
+      "grad_norm": 2.8125,
+      "learning_rate": 4.576993086319737e-05,
+      "loss": 0.2759,
+      "step": 310
+    },
+    {
+      "epoch": 0.2423076923076923,
+      "grad_norm": 2.671875,
+      "learning_rate": 4.56014267663072e-05,
+      "loss": 0.2688,
+      "step": 315
+    },
+    {
+      "epoch": 0.24615384615384617,
+      "grad_norm": 1.953125,
+      "learning_rate": 4.542999438813322e-05,
+      "loss": 0.2696,
+      "step": 320
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.40625,
+      "learning_rate": 4.5255661461418854e-05,
+      "loss": 0.2571,
+      "step": 325
+    },
+    {
+      "epoch": 0.25384615384615383,
+      "grad_norm": 1.9296875,
+      "learning_rate": 4.5078456188131464e-05,
+      "loss": 0.2871,
+      "step": 330
+    },
+    {
+      "epoch": 0.25769230769230766,
+      "grad_norm": 1.9453125,
+      "learning_rate": 4.4898407234900006e-05,
+      "loss": 0.2977,
+      "step": 335
+    },
+    {
+      "epoch": 0.26153846153846155,
+      "grad_norm": 1.8203125,
+      "learning_rate": 4.4715543728377706e-05,
+      "loss": 0.2467,
+      "step": 340
+    },
+    {
+      "epoch": 0.2653846153846154,
+      "grad_norm": 2.703125,
+      "learning_rate": 4.452989525053014e-05,
+      "loss": 0.2563,
+      "step": 345
+    },
+    {
+      "epoch": 0.2692307692307692,
+      "grad_norm": 2.46875,
+      "learning_rate": 4.434149183384977e-05,
+      "loss": 0.31,
+      "step": 350
+    },
+    {
+      "epoch": 0.27307692307692305,
+      "grad_norm": 2.078125,
+      "learning_rate": 4.41503639564976e-05,
+      "loss": 0.25,
+      "step": 355
+    },
+    {
+      "epoch": 0.27692307692307694,
+      "grad_norm": 2.453125,
+      "learning_rate": 4.3956542537372596e-05,
+      "loss": 0.2497,
+      "step": 360
+    },
+    {
+      "epoch": 0.28076923076923077,
+      "grad_norm": 2.53125,
+      "learning_rate": 4.3760058931110054e-05,
+      "loss": 0.2672,
+      "step": 365
+    },
+    {
+      "epoch": 0.2846153846153846,
+      "grad_norm": 1.9921875,
+      "learning_rate": 4.356094492300922e-05,
+      "loss": 0.2678,
+      "step": 370
+    },
+    {
+      "epoch": 0.28846153846153844,
+      "grad_norm": 3.421875,
+      "learning_rate": 4.3359232723891423e-05,
+      "loss": 0.2926,
+      "step": 375
+    },
+    {
+      "epoch": 0.2923076923076923,
+      "grad_norm": 2.40625,
+      "learning_rate": 4.315495496488927e-05,
+      "loss": 0.3105,
+      "step": 380
+    },
+    {
+      "epoch": 0.29615384615384616,
+      "grad_norm": 2.171875,
+      "learning_rate": 4.294814469216794e-05,
+      "loss": 0.2627,
+      "step": 385
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.96875,
+      "learning_rate": 4.2738835361579175e-05,
+      "loss": 0.257,
+      "step": 390
+    },
+    {
+      "epoch": 0.3038461538461538,
+      "grad_norm": 3.78125,
+      "learning_rate": 4.252706083324923e-05,
+      "loss": 0.3106,
+      "step": 395
+    },
+    {
+      "epoch": 0.3076923076923077,
+      "grad_norm": 2.859375,
+      "learning_rate": 4.2312855366101216e-05,
+      "loss": 0.2824,
+      "step": 400
+    },
+    {
+      "epoch": 0.31153846153846154,
+      "grad_norm": 3.359375,
+      "learning_rate": 4.2096253612313006e-05,
+      "loss": 0.2902,
+      "step": 405
+    },
+    {
+      "epoch": 0.3153846153846154,
+      "grad_norm": 2.328125,
+      "learning_rate": 4.187729061171158e-05,
+      "loss": 0.263,
+      "step": 410
+    },
+    {
+      "epoch": 0.3192307692307692,
+      "grad_norm": 1.59375,
+      "learning_rate": 4.165600178610454e-05,
+      "loss": 0.2842,
+      "step": 415
+    },
+    {
+      "epoch": 0.3230769230769231,
+      "grad_norm": 2.5625,
+      "learning_rate": 4.143242293354995e-05,
+      "loss": 0.2506,
+      "step": 420
+    },
+    {
+      "epoch": 0.3269230769230769,
+      "grad_norm": 2.15625,
+      "learning_rate": 4.120659022256523e-05,
+      "loss": 0.3082,
+      "step": 425
+    },
+    {
+      "epoch": 0.33076923076923076,
+      "grad_norm": 2.09375,
+      "learning_rate": 4.097854018627616e-05,
+      "loss": 0.3015,
+      "step": 430
+    },
+    {
+      "epoch": 0.3346153846153846,
+      "grad_norm": 2.765625,
+      "learning_rate": 4.074830971650691e-05,
+      "loss": 0.2668,
+      "step": 435
+    },
+    {
+      "epoch": 0.3384615384615385,
+      "grad_norm": 3.0,
+      "learning_rate": 4.0515936057811924e-05,
+      "loss": 0.2663,
+      "step": 440
+    },
+    {
+      "epoch": 0.3423076923076923,
+      "grad_norm": 1.40625,
+      "learning_rate": 4.0281456801451004e-05,
+      "loss": 0.2735,
+      "step": 445
+    },
+    {
+      "epoch": 0.34615384615384615,
+      "grad_norm": 2.25,
+      "learning_rate": 4.0044909879308004e-05,
+      "loss": 0.2879,
+      "step": 450
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 2.265625,
+      "learning_rate": 3.980633355775461e-05,
+      "loss": 0.2408,
+      "step": 455
+    },
+    {
+      "epoch": 0.35384615384615387,
+      "grad_norm": 2.375,
+      "learning_rate": 3.956576643145997e-05,
+      "loss": 0.2576,
+      "step": 460
+    },
+    {
+      "epoch": 0.3576923076923077,
+      "grad_norm": 2.3125,
+      "learning_rate": 3.9323247417147204e-05,
+      "loss": 0.263,
+      "step": 465
+    },
+    {
+      "epoch": 0.36153846153846153,
+      "grad_norm": 1.796875,
+      "learning_rate": 3.907881574729781e-05,
+      "loss": 0.2473,
+      "step": 470
+    },
+    {
+      "epoch": 0.36538461538461536,
+      "grad_norm": 1.90625,
+      "learning_rate": 3.8832510963804976e-05,
+      "loss": 0.2811,
+      "step": 475
+    },
+    {
+      "epoch": 0.36923076923076925,
+      "grad_norm": 2.671875,
+      "learning_rate": 3.858437291157689e-05,
+      "loss": 0.3075,
+      "step": 480
+    },
+    {
+      "epoch": 0.3730769230769231,
+      "grad_norm": 1.9921875,
+      "learning_rate": 3.833444173209102e-05,
+      "loss": 0.2493,
+      "step": 485
+    },
+    {
+      "epoch": 0.3769230769230769,
+      "grad_norm": 1.9453125,
+      "learning_rate": 3.80827578569003e-05,
+      "loss": 0.2414,
+      "step": 490
+    },
+    {
+      "epoch": 0.38076923076923075,
+      "grad_norm": 1.65625,
+      "learning_rate": 3.782936200109261e-05,
+      "loss": 0.2632,
+      "step": 495
+    },
+    {
+      "epoch": 0.38461538461538464,
+      "grad_norm": 1.8671875,
+      "learning_rate": 3.7574295156704175e-05,
+      "loss": 0.2756,
+      "step": 500
+    },
+    {
+      "epoch": 0.38846153846153847,
+      "grad_norm": 2.015625,
+      "learning_rate": 3.731759858608832e-05,
+      "loss": 0.2684,
+      "step": 505
+    },
+    {
+      "epoch": 0.3923076923076923,
+      "grad_norm": 2.375,
+      "learning_rate": 3.705931381524034e-05,
+      "loss": 0.2528,
+      "step": 510
+    },
+    {
+      "epoch": 0.39615384615384613,
+      "grad_norm": 1.6640625,
+      "learning_rate": 3.679948262707993e-05,
+      "loss": 0.2669,
+      "step": 515
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.078125,
+      "learning_rate": 3.6538147054691817e-05,
+      "loss": 0.2469,
+      "step": 520
+    },
+    {
+      "epoch": 0.40384615384615385,
+      "grad_norm": 2.28125,
+      "learning_rate": 3.62753493745261e-05,
+      "loss": 0.251,
+      "step": 525
+    },
+    {
+      "epoch": 0.4076923076923077,
+      "grad_norm": 1.9375,
+      "learning_rate": 3.601113209955916e-05,
+      "loss": 0.2568,
+      "step": 530
+    },
+    {
+      "epoch": 0.4115384615384615,
+      "grad_norm": 2.015625,
+      "learning_rate": 3.574553797241625e-05,
+      "loss": 0.2698,
+      "step": 535
+    },
+    {
+      "epoch": 0.4153846153846154,
+      "grad_norm": 2.109375,
+      "learning_rate": 3.5478609958457056e-05,
+      "loss": 0.2517,
+      "step": 540
+    },
+    {
+      "epoch": 0.41923076923076924,
+      "grad_norm": 2.890625,
+      "learning_rate": 3.52103912388251e-05,
+      "loss": 0.2793,
+      "step": 545
+    },
+    {
+      "epoch": 0.4230769230769231,
+      "grad_norm": 2.5,
+      "learning_rate": 3.4940925203462347e-05,
+      "loss": 0.244,
+      "step": 550
+    },
+    {
+      "epoch": 0.4269230769230769,
+      "grad_norm": 1.953125,
+      "learning_rate": 3.467025544408994e-05,
+      "loss": 0.2511,
+      "step": 555
+    },
+    {
+      "epoch": 0.4307692307692308,
+      "grad_norm": 3.15625,
+      "learning_rate": 3.4398425747156346e-05,
+      "loss": 0.2579,
+      "step": 560
+    },
+    {
+      "epoch": 0.4346153846153846,
+      "grad_norm": 1.921875,
+      "learning_rate": 3.412548008675403e-05,
+      "loss": 0.2595,
+      "step": 565
+    },
+    {
+      "epoch": 0.43846153846153846,
+      "grad_norm": 1.9296875,
+      "learning_rate": 3.3851462617505655e-05,
+      "loss": 0.2263,
+      "step": 570
+    },
+    {
+      "epoch": 0.4423076923076923,
+      "grad_norm": 1.671875,
+      "learning_rate": 3.357641766742121e-05,
+      "loss": 0.2641,
+      "step": 575
+    },
+    {
+      "epoch": 0.4461538461538462,
+      "grad_norm": 1.5390625,
+      "learning_rate": 3.3300389730727064e-05,
+      "loss": 0.2693,
+      "step": 580
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.4765625,
+      "learning_rate": 3.3023423460667985e-05,
+      "loss": 0.2478,
+      "step": 585
+    },
+    {
+      "epoch": 0.45384615384615384,
+      "grad_norm": 1.890625,
+      "learning_rate": 3.27455636622837e-05,
+      "loss": 0.24,
+      "step": 590
+    },
+    {
+      "epoch": 0.4576923076923077,
+      "grad_norm": 2.078125,
+      "learning_rate": 3.2466855285160644e-05,
+      "loss": 0.282,
+      "step": 595
+    },
+    {
+      "epoch": 0.46153846153846156,
+      "grad_norm": 2.4375,
+      "learning_rate": 3.218734341616047e-05,
+      "loss": 0.2595,
+      "step": 600
+    },
+    {
+      "epoch": 0.4653846153846154,
+      "grad_norm": 1.859375,
+      "learning_rate": 3.1907073272126306e-05,
+      "loss": 0.2573,
+      "step": 605
+    },
+    {
+      "epoch": 0.46923076923076923,
+      "grad_norm": 2.296875,
+      "learning_rate": 3.162609019256802e-05,
+      "loss": 0.2668,
+      "step": 610
+    },
+    {
+      "epoch": 0.47307692307692306,
+      "grad_norm": 1.6640625,
+      "learning_rate": 3.134443963232755e-05,
+      "loss": 0.2264,
+      "step": 615
+    },
+    {
+      "epoch": 0.47692307692307695,
+      "grad_norm": 2.125,
+      "learning_rate": 3.1062167154225725e-05,
+      "loss": 0.259,
+      "step": 620
+    },
+    {
+      "epoch": 0.4807692307692308,
+      "grad_norm": 2.625,
+      "learning_rate": 3.0779318421691464e-05,
+      "loss": 0.2848,
+      "step": 625
+    },
+    {
+      "epoch": 0.4846153846153846,
+      "grad_norm": 1.9921875,
+      "learning_rate": 3.0495939191374796e-05,
+      "loss": 0.2425,
+      "step": 630
+    },
+    {
+      "epoch": 0.48846153846153845,
+      "grad_norm": 1.6171875,
+      "learning_rate": 3.021207530574477e-05,
+      "loss": 0.2488,
+      "step": 635
+    },
+    {
+      "epoch": 0.49230769230769234,
+      "grad_norm": 2.015625,
+      "learning_rate": 2.9927772685673505e-05,
+      "loss": 0.2589,
+      "step": 640
+    },
+    {
+      "epoch": 0.49615384615384617,
+      "grad_norm": 1.9921875,
+      "learning_rate": 2.9643077323007484e-05,
+      "loss": 0.249,
+      "step": 645
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.75,
+      "learning_rate": 2.9358035273127483e-05,
+      "loss": 0.209,
+      "step": 650
+    },
+    {
+      "epoch": 0.5038461538461538,
+      "grad_norm": 1.828125,
+      "learning_rate": 2.9072692647498106e-05,
+      "loss": 0.2399,
+      "step": 655
+    },
+    {
+      "epoch": 0.5076923076923077,
+      "grad_norm": 1.4375,
+      "learning_rate": 2.8787095606208335e-05,
+      "loss": 0.1993,
+      "step": 660
+    },
+    {
+      "epoch": 0.5115384615384615,
+      "grad_norm": 1.9453125,
+      "learning_rate": 2.8501290350504162e-05,
+      "loss": 0.2196,
+      "step": 665
+    },
+    {
+      "epoch": 0.5153846153846153,
+      "grad_norm": 1.921875,
+      "learning_rate": 2.821532311531455e-05,
+      "loss": 0.2367,
+      "step": 670
+    },
+    {
+      "epoch": 0.5192307692307693,
+      "grad_norm": 1.8046875,
+      "learning_rate": 2.792924016177205e-05,
+      "loss": 0.2782,
+      "step": 675
+    },
+    {
+      "epoch": 0.5230769230769231,
+      "grad_norm": 1.296875,
+      "learning_rate": 2.764308776972903e-05,
+      "loss": 0.2169,
+      "step": 680
+    },
+    {
+      "epoch": 0.5269230769230769,
+      "grad_norm": 1.921875,
+      "learning_rate": 2.735691223027098e-05,
+      "loss": 0.222,
+      "step": 685
+    },
+    {
+      "epoch": 0.5307692307692308,
+      "grad_norm": 1.984375,
+      "learning_rate": 2.7070759838227955e-05,
+      "loss": 0.2558,
+      "step": 690
+    },
+    {
+      "epoch": 0.5346153846153846,
+      "grad_norm": 2.03125,
+      "learning_rate": 2.678467688468545e-05,
+      "loss": 0.2375,
+      "step": 695
+    },
+    {
+      "epoch": 0.5384615384615384,
+      "grad_norm": 2.953125,
+      "learning_rate": 2.649870964949585e-05,
+      "loss": 0.2702,
+      "step": 700
+    },
+    {
+      "epoch": 0.5423076923076923,
+      "grad_norm": 1.4296875,
+      "learning_rate": 2.621290439379167e-05,
+      "loss": 0.2263,
+      "step": 705
+    },
+    {
+      "epoch": 0.5461538461538461,
+      "grad_norm": 2.484375,
+      "learning_rate": 2.5927307352501896e-05,
+      "loss": 0.2527,
+      "step": 710
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 2.296875,
+      "learning_rate": 2.5641964726872526e-05,
+      "loss": 0.1968,
+      "step": 715
+    },
+    {
+      "epoch": 0.5538461538461539,
+      "grad_norm": 2.015625,
+      "learning_rate": 2.5356922676992524e-05,
+      "loss": 0.2424,
+      "step": 720
+    },
+    {
+      "epoch": 0.5576923076923077,
+      "grad_norm": 1.9296875,
+      "learning_rate": 2.5072227314326508e-05,
+      "loss": 0.2826,
+      "step": 725
+    },
+    {
+      "epoch": 0.5615384615384615,
+      "grad_norm": 1.796875,
+      "learning_rate": 2.4787924694255237e-05,
+      "loss": 0.2383,
+      "step": 730
+    },
+    {
+      "epoch": 0.5653846153846154,
+      "grad_norm": 1.671875,
+      "learning_rate": 2.4504060808625213e-05,
+      "loss": 0.2624,
+      "step": 735
+    },
+    {
+      "epoch": 0.5692307692307692,
+      "grad_norm": 1.8125,
+      "learning_rate": 2.422068157830854e-05,
+      "loss": 0.2473,
+      "step": 740
+    },
+    {
+      "epoch": 0.573076923076923,
+      "grad_norm": 2.515625,
+      "learning_rate": 2.3937832845774277e-05,
+      "loss": 0.2391,
+      "step": 745
+    },
+    {
+      "epoch": 0.5769230769230769,
+      "grad_norm": 1.7265625,
+      "learning_rate": 2.3655560367672454e-05,
+      "loss": 0.251,
+      "step": 750
+    },
+    {
+      "epoch": 0.5807692307692308,
+      "grad_norm": 1.5390625,
+      "learning_rate": 2.3373909807431993e-05,
+      "loss": 0.2025,
+      "step": 755
+    },
+    {
+      "epoch": 0.5846153846153846,
+      "grad_norm": 1.6015625,
+      "learning_rate": 2.3092926727873702e-05,
+      "loss": 0.2399,
+      "step": 760
+    },
+    {
+      "epoch": 0.5884615384615385,
+      "grad_norm": 1.9375,
+      "learning_rate": 2.2812656583839538e-05,
+      "loss": 0.2689,
+      "step": 765
+    },
+    {
+      "epoch": 0.5923076923076923,
+      "grad_norm": 1.8515625,
+      "learning_rate": 2.253314471483937e-05,
+      "loss": 0.2576,
+      "step": 770
+    },
+    {
+      "epoch": 0.5961538461538461,
+      "grad_norm": 1.765625,
+      "learning_rate": 2.2254436337716305e-05,
+      "loss": 0.2349,
+      "step": 775
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.421875,
+      "learning_rate": 2.1976576539332024e-05,
+      "loss": 0.2436,
+      "step": 780
+    },
+    {
+      "epoch": 0.6038461538461538,
+      "grad_norm": 1.890625,
+      "learning_rate": 2.1699610269272952e-05,
+      "loss": 0.2533,
+      "step": 785
+    },
+    {
+      "epoch": 0.6076923076923076,
+      "grad_norm": 1.515625,
+      "learning_rate": 2.1423582332578796e-05,
+      "loss": 0.2787,
+      "step": 790
+    },
+    {
+      "epoch": 0.6115384615384616,
+      "grad_norm": 1.703125,
+      "learning_rate": 2.1148537382494353e-05,
+      "loss": 0.2291,
+      "step": 795
+    },
+    {
+      "epoch": 0.6153846153846154,
+      "grad_norm": 1.515625,
+      "learning_rate": 2.0874519913245977e-05,
+      "loss": 0.2548,
+      "step": 800
+    },
+    {
+      "epoch": 0.6192307692307693,
+      "grad_norm": 1.703125,
+      "learning_rate": 2.060157425284366e-05,
+      "loss": 0.2128,
+      "step": 805
+    },
+    {
+      "epoch": 0.6230769230769231,
+      "grad_norm": 1.8046875,
+      "learning_rate": 2.0329744555910075e-05,
+      "loss": 0.2442,
+      "step": 810
+    },
+    {
+      "epoch": 0.6269230769230769,
+      "grad_norm": 1.5703125,
+      "learning_rate": 2.0059074796537662e-05,
+      "loss": 0.2584,
+      "step": 815
+    },
+    {
+      "epoch": 0.6307692307692307,
+      "grad_norm": 1.9375,
+      "learning_rate": 1.978960876117491e-05,
+      "loss": 0.2312,
+      "step": 820
+    },
+    {
+      "epoch": 0.6346153846153846,
+      "grad_norm": 2.59375,
+      "learning_rate": 1.9521390041542946e-05,
+      "loss": 0.2475,
+      "step": 825
+    },
+    {
+      "epoch": 0.6384615384615384,
+      "grad_norm": 2.203125,
+      "learning_rate": 1.9254462027583753e-05,
+      "loss": 0.2416,
+      "step": 830
+    },
+    {
+      "epoch": 0.6423076923076924,
+      "grad_norm": 1.578125,
+      "learning_rate": 1.8988867900440845e-05,
+      "loss": 0.2377,
+      "step": 835
+    },
+    {
+      "epoch": 0.6461538461538462,
+      "grad_norm": 1.796875,
+      "learning_rate": 1.87246506254739e-05,
+      "loss": 0.2644,
+      "step": 840
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.71875,
+      "learning_rate": 1.8461852945308196e-05,
+      "loss": 0.2375,
+      "step": 845
+    },
+    {
+      "epoch": 0.6538461538461539,
+      "grad_norm": 1.875,
+      "learning_rate": 1.8200517372920078e-05,
+      "loss": 0.2666,
+      "step": 850
+    },
+    {
+      "epoch": 0.6576923076923077,
+      "grad_norm": 1.8671875,
+      "learning_rate": 1.794068618475967e-05,
+      "loss": 0.2515,
+      "step": 855
+    },
+    {
+      "epoch": 0.6615384615384615,
+      "grad_norm": 1.5703125,
+      "learning_rate": 1.7682401413911697e-05,
+      "loss": 0.2091,
+      "step": 860
+    },
+    {
+      "epoch": 0.6653846153846154,
+      "grad_norm": 1.8203125,
+      "learning_rate": 1.742570484329583e-05,
+      "loss": 0.2332,
+      "step": 865
+    },
+    {
+      "epoch": 0.6692307692307692,
+      "grad_norm": 1.5859375,
+      "learning_rate": 1.7170637998907397e-05,
+      "loss": 0.2388,
+      "step": 870
+    },
+    {
+      "epoch": 0.6730769230769231,
+      "grad_norm": 1.8515625,
+      "learning_rate": 1.6917242143099704e-05,
+      "loss": 0.2565,
+      "step": 875
+    },
+    {
+      "epoch": 0.676923076923077,
+      "grad_norm": 1.984375,
+      "learning_rate": 1.6665558267908986e-05,
+      "loss": 0.2539,
+      "step": 880
+    },
+    {
+      "epoch": 0.6807692307692308,
+      "grad_norm": 1.8359375,
+      "learning_rate": 1.64156270884231e-05,
+      "loss": 0.2241,
+      "step": 885
+    },
+    {
+      "epoch": 0.6846153846153846,
+      "grad_norm": 1.625,
+      "learning_rate": 1.616748903619504e-05,
+      "loss": 0.2219,
+      "step": 890
+    },
+    {
+      "epoch": 0.6884615384615385,
+      "grad_norm": 1.375,
+      "learning_rate": 1.5921184252702192e-05,
+      "loss": 0.1955,
+      "step": 895
+    },
+    {
+      "epoch": 0.6923076923076923,
+      "grad_norm": 2.3125,
+      "learning_rate": 1.5676752582852794e-05,
+      "loss": 0.2408,
+      "step": 900
+    },
+    {
+      "epoch": 0.6961538461538461,
+      "grad_norm": 1.4609375,
+      "learning_rate": 1.543423356854004e-05,
+      "loss": 0.2302,
+      "step": 905
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.1796875,
+      "learning_rate": 1.5193666442245402e-05,
+      "loss": 0.2266,
+      "step": 910
+    },
+    {
+      "epoch": 0.7038461538461539,
+      "grad_norm": 1.75,
+      "learning_rate": 1.4955090120692006e-05,
+      "loss": 0.2251,
+      "step": 915
+    },
+    {
+      "epoch": 0.7076923076923077,
+      "grad_norm": 1.359375,
+      "learning_rate": 1.4718543198548998e-05,
+      "loss": 0.2318,
+      "step": 920
+    },
+    {
+      "epoch": 0.7115384615384616,
+      "grad_norm": 1.6640625,
+      "learning_rate": 1.4484063942188076e-05,
+      "loss": 0.249,
+      "step": 925
+    },
+    {
+      "epoch": 0.7153846153846154,
+      "grad_norm": 2.109375,
+      "learning_rate": 1.4251690283493105e-05,
+      "loss": 0.2817,
+      "step": 930
+    },
+    {
+      "epoch": 0.7192307692307692,
+      "grad_norm": 2.609375,
+      "learning_rate": 1.402145981372384e-05,
+      "loss": 0.2437,
+      "step": 935
+    },
+    {
+      "epoch": 0.7230769230769231,
+      "grad_norm": 2.078125,
+      "learning_rate": 1.3793409777434775e-05,
+      "loss": 0.2358,
+      "step": 940
+    },
+    {
+      "epoch": 0.7269230769230769,
+      "grad_norm": 1.6328125,
+      "learning_rate": 1.3567577066450066e-05,
+      "loss": 0.2126,
+      "step": 945
+    },
+    {
+      "epoch": 0.7307692307692307,
+      "grad_norm": 1.2734375,
+      "learning_rate": 1.3343998213895464e-05,
+      "loss": 0.2248,
+      "step": 950
+    },
+    {
+      "epoch": 0.7346153846153847,
+      "grad_norm": 1.7421875,
+      "learning_rate": 1.3122709388288423e-05,
+      "loss": 0.2528,
+      "step": 955
+    },
+    {
+      "epoch": 0.7384615384615385,
+      "grad_norm": 1.703125,
+      "learning_rate": 1.2903746387686994e-05,
+      "loss": 0.2311,
+      "step": 960
+    },
+    {
+      "epoch": 0.7423076923076923,
+      "grad_norm": 1.734375,
+      "learning_rate": 1.268714463389879e-05,
+      "loss": 0.2829,
+      "step": 965
+    },
+    {
+      "epoch": 0.7461538461538462,
+      "grad_norm": 1.4140625,
+      "learning_rate": 1.2472939166750766e-05,
+      "loss": 0.2245,
+      "step": 970
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.71875,
+      "learning_rate": 1.2261164638420832e-05,
+      "loss": 0.2457,
+      "step": 975
+    },
+    {
+      "epoch": 0.7538461538461538,
+      "grad_norm": 1.9765625,
+      "learning_rate": 1.2051855307832074e-05,
+      "loss": 0.2374,
+      "step": 980
+    },
+    {
+      "epoch": 0.7576923076923077,
+      "grad_norm": 1.7734375,
+      "learning_rate": 1.184504503511073e-05,
+      "loss": 0.2299,
+      "step": 985
+    },
+    {
+      "epoch": 0.7615384615384615,
+      "grad_norm": 1.8515625,
+      "learning_rate": 1.1640767276108584e-05,
+      "loss": 0.2576,
+      "step": 990
+    },
+    {
+      "epoch": 0.7653846153846153,
+      "grad_norm": 1.6640625,
+      "learning_rate": 1.1439055076990782e-05,
+      "loss": 0.2014,
+      "step": 995
+    },
+    {
+      "epoch": 0.7692307692307693,
+      "grad_norm": 1.4375,
+      "learning_rate": 1.123994106888995e-05,
+      "loss": 0.2618,
+      "step": 1000
+    },
+    {
+      "epoch": 0.7730769230769231,
+      "grad_norm": 1.7890625,
+      "learning_rate": 1.1043457462627404e-05,
+      "loss": 0.2112,
+      "step": 1005
+    },
+    {
+      "epoch": 0.7769230769230769,
+      "grad_norm": 1.5,
+      "learning_rate": 1.084963604350241e-05,
+      "loss": 0.236,
+      "step": 1010
+    },
+    {
+      "epoch": 0.7807692307692308,
+      "grad_norm": 1.328125,
+      "learning_rate": 1.0658508166150224e-05,
+      "loss": 0.221,
+      "step": 1015
+    },
+    {
+      "epoch": 0.7846153846153846,
+      "grad_norm": 2.15625,
+      "learning_rate": 1.0470104749469867e-05,
+      "loss": 0.2579,
+      "step": 1020
+    },
+    {
+      "epoch": 0.7884615384615384,
+      "grad_norm": 1.84375,
+      "learning_rate": 1.0284456271622295e-05,
+      "loss": 0.2139,
+      "step": 1025
+    },
+    {
+      "epoch": 0.7923076923076923,
+      "grad_norm": 1.8046875,
+      "learning_rate": 1.010159276509999e-05,
+      "loss": 0.2461,
+      "step": 1030
+    },
+    {
+      "epoch": 0.7961538461538461,
+      "grad_norm": 1.5625,
+      "learning_rate": 9.921543811868548e-06,
+      "loss": 0.2538,
+      "step": 1035
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.65625,
+      "learning_rate": 9.744338538581147e-06,
+      "loss": 0.2173,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8038461538461539,
+      "grad_norm": 1.4609375,
+      "learning_rate": 9.570005611866787e-06,
+      "loss": 0.2277,
+      "step": 1045
+    },
+    {
+      "epoch": 0.8076923076923077,
+      "grad_norm": 1.4609375,
+      "learning_rate": 9.398573233692802e-06,
+      "loss": 0.2432,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8115384615384615,
+      "grad_norm": 1.4609375,
+      "learning_rate": 9.230069136802633e-06,
+      "loss": 0.2622,
+      "step": 1055
+    },
+    {
+      "epoch": 0.8153846153846154,
+      "grad_norm": 1.890625,
+      "learning_rate": 9.064520580229482e-06,
+      "loss": 0.2528,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8192307692307692,
+      "grad_norm": 1.65625,
+      "learning_rate": 8.90195434488659e-06,
+      "loss": 0.2785,
+      "step": 1065
+    },
+    {
+      "epoch": 0.823076923076923,
+      "grad_norm": 1.6484375,
+      "learning_rate": 8.742396729234873e-06,
+      "loss": 0.2489,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8269230769230769,
+      "grad_norm": 1.71875,
+      "learning_rate": 8.5858735450286e-06,
+      "loss": 0.2395,
+      "step": 1075
+    },
+    {
+      "epoch": 0.8307692307692308,
+      "grad_norm": 2.578125,
+      "learning_rate": 8.432410113139803e-06,
+      "loss": 0.2466,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8346153846153846,
+      "grad_norm": 2.0625,
+      "learning_rate": 8.2820312594621e-06,
+      "loss": 0.261,
+      "step": 1085
+    },
+    {
+      "epoch": 0.8384615384615385,
+      "grad_norm": 1.6875,
+      "learning_rate": 8.134761310894623e-06,
+      "loss": 0.2618,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8423076923076923,
+      "grad_norm": 1.7109375,
+      "learning_rate": 7.990624091406568e-06,
+      "loss": 0.2358,
+      "step": 1095
+    },
+    {
+      "epoch": 0.8461538461538461,
+      "grad_norm": 1.40625,
+      "learning_rate": 7.849642918183244e-06,
+      "loss": 0.2252,
+      "step": 1100
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 2.640625,
+      "learning_rate": 7.711840597853998e-06,
+      "loss": 0.25,
+      "step": 1105
+    },
+    {
+      "epoch": 0.8538461538461538,
+      "grad_norm": 1.546875,
+      "learning_rate": 7.57723942280278e-06,
+      "loss": 0.2206,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8576923076923076,
+      "grad_norm": 1.3359375,
+      "learning_rate": 7.44586116756188e-06,
+      "loss": 0.2147,
+      "step": 1115
+    },
+    {
+      "epoch": 0.8615384615384616,
+      "grad_norm": 1.3671875,
+      "learning_rate": 7.317727085289458e-06,
+      "loss": 0.2299,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8653846153846154,
+      "grad_norm": 1.4453125,
+      "learning_rate": 7.192857904331393e-06,
+      "loss": 0.2149,
+      "step": 1125
+    },
+    {
+      "epoch": 0.8692307692307693,
+      "grad_norm": 1.609375,
+      "learning_rate": 7.071273824868049e-06,
+      "loss": 0.2584,
+      "step": 1130
+    },
+    {
+      "epoch": 0.8730769230769231,
+      "grad_norm": 1.1953125,
+      "learning_rate": 6.95299451564648e-06,
+      "loss": 0.2256,
+      "step": 1135
+    },
+    {
+      "epoch": 0.8769230769230769,
+      "grad_norm": 1.6484375,
+      "learning_rate": 6.838039110798579e-06,
+      "loss": 0.2801,
+      "step": 1140
+    },
+    {
+      "epoch": 0.8807692307692307,
+      "grad_norm": 1.796875,
+      "learning_rate": 6.726426206745781e-06,
+      "loss": 0.2297,
+      "step": 1145
+    },
+    {
+      "epoch": 0.8846153846153846,
+      "grad_norm": 1.328125,
+      "learning_rate": 6.6181738591906685e-06,
+      "loss": 0.2127,
+      "step": 1150
+    },
+    {
+      "epoch": 0.8884615384615384,
+      "grad_norm": 1.8515625,
+      "learning_rate": 6.513299580196086e-06,
+      "loss": 0.2394,
+      "step": 1155
+    },
+    {
+      "epoch": 0.8923076923076924,
+      "grad_norm": 1.5234375,
+      "learning_rate": 6.411820335352222e-06,
+      "loss": 0.2193,
+      "step": 1160
+    },
+    {
+      "epoch": 0.8961538461538462,
+      "grad_norm": 1.78125,
+      "learning_rate": 6.313752541032071e-06,
+      "loss": 0.233,
+      "step": 1165
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 2.296875,
+      "learning_rate": 6.219112061735721e-06,
+      "loss": 0.2243,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9038461538461539,
+      "grad_norm": 2.0625,
+      "learning_rate": 6.127914207523965e-06,
+      "loss": 0.231,
+      "step": 1175
+    },
+    {
+      "epoch": 0.9076923076923077,
+      "grad_norm": 1.2421875,
+      "learning_rate": 6.040173731541575e-06,
+      "loss": 0.1955,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9115384615384615,
+      "grad_norm": 1.6328125,
+      "learning_rate": 5.955904827630666e-06,
+      "loss": 0.2635,
+      "step": 1185
+    },
+    {
+      "epoch": 0.9153846153846154,
+      "grad_norm": 1.5625,
+      "learning_rate": 5.87512112803456e-06,
+      "loss": 0.2445,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9192307692307692,
+      "grad_norm": 2.59375,
+      "learning_rate": 5.797835701192484e-06,
+      "loss": 0.23,
+      "step": 1195
+    },
+    {
+      "epoch": 0.9230769230769231,
+      "grad_norm": 1.7109375,
+      "learning_rate": 5.7240610496254724e-06,
+      "loss": 0.2614,
+      "step": 1200
+    },
+    {
+      "epoch": 0.926923076923077,
+      "grad_norm": 1.65625,
+      "learning_rate": 5.65380910791383e-06,
+      "loss": 0.251,
+      "step": 1205
+    },
+    {
+      "epoch": 0.9307692307692308,
+      "grad_norm": 1.3046875,
+      "learning_rate": 5.5870912407664715e-06,
+      "loss": 0.2453,
+      "step": 1210
+    },
+    {
+      "epoch": 0.9346153846153846,
+      "grad_norm": 1.9453125,
+      "learning_rate": 5.52391824118243e-06,
+      "loss": 0.2123,
+      "step": 1215
+    },
+    {
+      "epoch": 0.9384615384615385,
+      "grad_norm": 1.4296875,
+      "learning_rate": 5.464300328704876e-06,
+      "loss": 0.2387,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9423076923076923,
+      "grad_norm": 1.625,
+      "learning_rate": 5.408247147767904e-06,
+      "loss": 0.2374,
+      "step": 1225
+    },
+    {
+      "epoch": 0.9461538461538461,
+      "grad_norm": 1.9765625,
+      "learning_rate": 5.355767766136315e-06,
+      "loss": 0.229,
+      "step": 1230
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.75,
+      "learning_rate": 5.3068706734387484e-06,
+      "loss": 0.2112,
+      "step": 1235
+    },
+    {
+      "epoch": 0.9538461538461539,
+      "grad_norm": 1.5234375,
+      "learning_rate": 5.261563779794289e-06,
+      "loss": 0.2267,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9576923076923077,
+      "grad_norm": 1.71875,
+      "learning_rate": 5.219854414532854e-06,
+      "loss": 0.2318,
+      "step": 1245
+    },
+    {
+      "epoch": 0.9615384615384616,
+      "grad_norm": 2.234375,
+      "learning_rate": 5.181749325009511e-06,
+      "loss": 0.2305,
+      "step": 1250
+    },
+    {
+      "epoch": 0.9653846153846154,
+      "grad_norm": 1.8359375,
+      "learning_rate": 5.1472546755129605e-06,
+      "loss": 0.275,
+      "step": 1255
+    },
+    {
+      "epoch": 0.9692307692307692,
+      "grad_norm": 1.5390625,
+      "learning_rate": 5.116376046268333e-06,
+      "loss": 0.2549,
+      "step": 1260
+    },
+    {
+      "epoch": 0.9730769230769231,
+      "grad_norm": 1.3984375,
+      "learning_rate": 5.089118432534466e-06,
+      "loss": 0.2493,
+      "step": 1265
+    },
+    {
+      "epoch": 0.9769230769230769,
+      "grad_norm": 1.8671875,
+      "learning_rate": 5.065486243795829e-06,
+      "loss": 0.263,
+      "step": 1270
+    },
+    {
+      "epoch": 0.9807692307692307,
+      "grad_norm": 1.5390625,
+      "learning_rate": 5.045483303049184e-06,
+      "loss": 0.2837,
+      "step": 1275
+    },
+    {
+      "epoch": 0.9846153846153847,
+      "grad_norm": 1.296875,
+      "learning_rate": 5.02911284618515e-06,
+      "loss": 0.1995,
+      "step": 1280
+    },
+    {
+      "epoch": 0.9884615384615385,
+      "grad_norm": 1.4375,
+      "learning_rate": 5.01637752146472e-06,
+      "loss": 0.2232,
+      "step": 1285
+    },
+    {
+      "epoch": 0.9923076923076923,
+      "grad_norm": 2.015625,
+      "learning_rate": 5.007279389090856e-06,
+      "loss": 0.2487,
+      "step": 1290
+    },
+    {
+      "epoch": 0.9961538461538462,
+      "grad_norm": 1.84375,
+      "learning_rate": 5.001819920875201e-06,
+      "loss": 0.2508,
+      "step": 1295
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.8359375,
+      "learning_rate": 5e-06,
+      "loss": 0.2403,
+      "step": 1300
+    },
     {
       "epoch": 1.0,
+      "step": 1300,
+      "total_flos": 4.404961752383488e+16,
+      "train_loss": 0.28405343046555154,
+      "train_runtime": 1315.0218,
+      "train_samples_per_second": 0.989,
+      "train_steps_per_second": 0.989
     }
   ],
   "logging_steps": 5,
+  "max_steps": 1300,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 100,
       "attributes": {}
     }
   },
+  "total_flos": 4.404961752383488e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null