Training in progress, step 200, checkpoint

Browse files

Files changed (8) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state_0.pth +1 -1
last-checkpoint/rng_state_1.pth +1 -1
last-checkpoint/rng_state_2.pth +1 -1
last-checkpoint/rng_state_3.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +710 -3

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9192a84227fca2aac03205fb395d6ce7c4837e98cd36fa369ddb920a8bff5939
 size 373077376

 version https://git-lfs.github.com/spec/v1
+oid sha256:5fbd2e60526489e6f6f39cf288ada5ee34355bdd563beb7e5399d0ac622a5c3e
 size 373077376

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:591d7feb697303f84a724edf4dec4e8afa84368269c2c44266862f6235dde6a9
 size 422377867

 version https://git-lfs.github.com/spec/v1
+oid sha256:3f6e8dff52c08eae75b95c752c0f6f01bfbb1ed09c5bce28a0e4593cda5e5c80
 size 422377867

last-checkpoint/rng_state_0.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:18bb6b6e7da6250d22c25b95c64287be4aea598d9d97ef67b0fd69b9be869ed7
 size 15365

 version https://git-lfs.github.com/spec/v1
+oid sha256:ac0e770c61e3acef7bd144efee65a450ed487b244ed293a8dd801394ffcae775
 size 15365

last-checkpoint/rng_state_1.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3cff16289615ba210b601be2162794d1949cb1d62132099a4cf62330c43649a5
 size 15365

 version https://git-lfs.github.com/spec/v1
+oid sha256:21354c9980d6ba769d430eb9962ffba457eb04a66cb90e33b0934d3157fae7cf
 size 15365

last-checkpoint/rng_state_2.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c6caa3980c911491b74ec2e96e48a78cd7c3365dd32116c61642a4b6839a2da1
 size 15365

 version https://git-lfs.github.com/spec/v1
+oid sha256:d574a8567b1636a399633d2b20273fdb3be4b888e37f0d331cd695b973f10463
 size 15365

last-checkpoint/rng_state_3.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4e93d0d6464a92d227e9235c134b2224cf28ea45ba375cf82e6949c632e2b0d5
 size 15365

 version https://git-lfs.github.com/spec/v1
+oid sha256:cca2b80df90b78b3ba9d9d2951c5b7aa80fc0a44b4512b495de6de40396fcb6e
 size 15365

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c8f74ef64bb62eb0db7b90ee83cd7b2ecc127cfca56e27af0bc348a6066ee6ce
 size 1401

 version https://git-lfs.github.com/spec/v1
+oid sha256:39a79e2827280868096ff650d0ee6e4723dddc824192c37a937a656d54903350
 size 1401

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.4166666666666667,
   "eval_steps": 100,
-  "global_step": 100,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -715,6 +715,713 @@
       "eval_samples_per_second": 17.321,
       "eval_steps_per_second": 1.083,
       "step": 100
     }
   ],
   "logging_steps": 1,
@@ -734,7 +1441,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 5.09447277379584e+16,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.8333333333333334,
   "eval_steps": 100,
+  "global_step": 200,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 17.321,
       "eval_steps_per_second": 1.083,
       "step": 100
+    },
+    {
+      "epoch": 0.42083333333333334,
+      "grad_norm": 0.466796875,
+      "learning_rate": 0.0006753187775963773,
+      "loss": 6.1547,
+      "step": 101
+    },
+    {
+      "epoch": 0.425,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.0006688502846768696,
+      "loss": 6.1169,
+      "step": 102
+    },
+    {
+      "epoch": 0.42916666666666664,
+      "grad_norm": 0.578125,
+      "learning_rate": 0.0006623497346023419,
+      "loss": 5.9032,
+      "step": 103
+    },
+    {
+      "epoch": 0.43333333333333335,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.0006558183615379707,
+      "loss": 5.9968,
+      "step": 104
+    },
+    {
+      "epoch": 0.4375,
+      "grad_norm": 0.498046875,
+      "learning_rate": 0.0006492574055008473,
+      "loss": 6.0527,
+      "step": 105
+    },
+    {
+      "epoch": 0.44166666666666665,
+      "grad_norm": 0.45703125,
+      "learning_rate": 0.0006426681121245527,
+      "loss": 5.9248,
+      "step": 106
+    },
+    {
+      "epoch": 0.44583333333333336,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.0006360517324226675,
+      "loss": 5.8954,
+      "step": 107
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.482421875,
+      "learning_rate": 0.0006294095225512603,
+      "loss": 5.7513,
+      "step": 108
+    },
+    {
+      "epoch": 0.45416666666666666,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0006227427435703996,
+      "loss": 5.9773,
+      "step": 109
+    },
+    {
+      "epoch": 0.4583333333333333,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.0006160526612047339,
+      "loss": 5.8157,
+      "step": 110
+    },
+    {
+      "epoch": 0.4625,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.0006093405456031879,
+      "loss": 5.9297,
+      "step": 111
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0006026076710978171,
+      "loss": 5.9629,
+      "step": 112
+    },
+    {
+      "epoch": 0.4708333333333333,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0005958553159618693,
+      "loss": 5.9699,
+      "step": 113
+    },
+    {
+      "epoch": 0.475,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.0005890847621670966,
+      "loss": 5.9486,
+      "step": 114
+    },
+    {
+      "epoch": 0.4791666666666667,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.000582297295140367,
+      "loss": 5.9959,
+      "step": 115
+    },
+    {
+      "epoch": 0.48333333333333334,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0005754942035196184,
+      "loss": 6.011,
+      "step": 116
+    },
+    {
+      "epoch": 0.4875,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.0005686767789092041,
+      "loss": 5.7683,
+      "step": 117
+    },
+    {
+      "epoch": 0.49166666666666664,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.000561846315634674,
+      "loss": 5.9579,
+      "step": 118
+    },
+    {
+      "epoch": 0.49583333333333335,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0005550041104970397,
+      "loss": 5.8793,
+      "step": 119
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.44921875,
+      "learning_rate": 0.0005481514625265708,
+      "loss": 5.7256,
+      "step": 120
+    },
+    {
+      "epoch": 0.5041666666666667,
+      "grad_norm": 0.44921875,
+      "learning_rate": 0.0005412896727361663,
+      "loss": 5.8975,
+      "step": 121
+    },
+    {
+      "epoch": 0.5083333333333333,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0005344200438743488,
+      "loss": 5.959,
+      "step": 122
+    },
+    {
+      "epoch": 0.5125,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.0005275438801779327,
+      "loss": 5.8133,
+      "step": 123
+    },
+    {
+      "epoch": 0.5166666666666667,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0005206624871244065,
+      "loss": 5.8534,
+      "step": 124
+    },
+    {
+      "epoch": 0.5208333333333334,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0005137771711840811,
+      "loss": 5.8988,
+      "step": 125
+    },
+    {
+      "epoch": 0.525,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0005068892395720482,
+      "loss": 6.0026,
+      "step": 126
+    },
+    {
+      "epoch": 0.5291666666666667,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.0005,
+      "loss": 5.8737,
+      "step": 127
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0004931107604279518,
+      "loss": 5.7772,
+      "step": 128
+    },
+    {
+      "epoch": 0.5375,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.0004862228288159191,
+      "loss": 5.8457,
+      "step": 129
+    },
+    {
+      "epoch": 0.5416666666666666,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.0004793375128755934,
+      "loss": 5.7298,
+      "step": 130
+    },
+    {
+      "epoch": 0.5458333333333333,
+      "grad_norm": 0.4375,
+      "learning_rate": 0.0004724561198220672,
+      "loss": 6.0151,
+      "step": 131
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.0004655799561256514,
+      "loss": 5.7457,
+      "step": 132
+    },
+    {
+      "epoch": 0.5541666666666667,
+      "grad_norm": 0.470703125,
+      "learning_rate": 0.0004587103272638339,
+      "loss": 5.7534,
+      "step": 133
+    },
+    {
+      "epoch": 0.5583333333333333,
+      "grad_norm": 0.486328125,
+      "learning_rate": 0.0004518485374734292,
+      "loss": 5.7571,
+      "step": 134
+    },
+    {
+      "epoch": 0.5625,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.0004449958895029604,
+      "loss": 5.7288,
+      "step": 135
+    },
+    {
+      "epoch": 0.5666666666666667,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.00043815368436532617,
+      "loss": 5.8647,
+      "step": 136
+    },
+    {
+      "epoch": 0.5708333333333333,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0004313232210907959,
+      "loss": 5.8594,
+      "step": 137
+    },
+    {
+      "epoch": 0.575,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00042450579648038153,
+      "loss": 5.7164,
+      "step": 138
+    },
+    {
+      "epoch": 0.5791666666666667,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00041770270485963295,
+      "loss": 5.7606,
+      "step": 139
+    },
+    {
+      "epoch": 0.5833333333333334,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0004109152378329036,
+      "loss": 5.7807,
+      "step": 140
+    },
+    {
+      "epoch": 0.5875,
+      "grad_norm": 0.44921875,
+      "learning_rate": 0.00040414468403813093,
+      "loss": 5.7903,
+      "step": 141
+    },
+    {
+      "epoch": 0.5916666666666667,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00039739232890218293,
+      "loss": 5.6957,
+      "step": 142
+    },
+    {
+      "epoch": 0.5958333333333333,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00039065945439681213,
+      "loss": 5.6413,
+      "step": 143
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0003839473387952662,
+      "loss": 5.6618,
+      "step": 144
+    },
+    {
+      "epoch": 0.6041666666666666,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00037725725642960046,
+      "loss": 5.7087,
+      "step": 145
+    },
+    {
+      "epoch": 0.6083333333333333,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0003705904774487396,
+      "loss": 5.5071,
+      "step": 146
+    },
+    {
+      "epoch": 0.6125,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0003639482675773324,
+      "loss": 5.6133,
+      "step": 147
+    },
+    {
+      "epoch": 0.6166666666666667,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00035733188787544744,
+      "loss": 5.5889,
+      "step": 148
+    },
+    {
+      "epoch": 0.6208333333333333,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00035074259449915284,
+      "loss": 5.6028,
+      "step": 149
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00034418163846202944,
+      "loss": 5.8408,
+      "step": 150
+    },
+    {
+      "epoch": 0.6291666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0003376502653976583,
+      "loss": 5.8503,
+      "step": 151
+    },
+    {
+      "epoch": 0.6333333333333333,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0003311497153231305,
+      "loss": 5.6345,
+      "step": 152
+    },
+    {
+      "epoch": 0.6375,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00032468122240362287,
+      "loss": 5.5636,
+      "step": 153
+    },
+    {
+      "epoch": 0.6416666666666667,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.000318246014718085,
+      "loss": 5.7088,
+      "step": 154
+    },
+    {
+      "epoch": 0.6458333333333334,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0003118453140260823,
+      "loss": 5.7296,
+      "step": 155
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.000305480335535837,
+      "loss": 5.7223,
+      "step": 156
+    },
+    {
+      "epoch": 0.6541666666666667,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002991522876735154,
+      "loss": 5.5813,
+      "step": 157
+    },
+    {
+      "epoch": 0.6583333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002928623718538006,
+      "loss": 5.6656,
+      "step": 158
+    },
+    {
+      "epoch": 0.6625,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.0002866117822517982,
+      "loss": 5.5557,
+      "step": 159
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0002804017055763149,
+      "loss": 5.6746,
+      "step": 160
+    },
+    {
+      "epoch": 0.6708333333333333,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00027423332084455543,
+      "loss": 5.4792,
+      "step": 161
+    },
+    {
+      "epoch": 0.675,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002681077991582797,
+      "loss": 5.6576,
+      "step": 162
+    },
+    {
+      "epoch": 0.6791666666666667,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002620263034814632,
+      "loss": 5.6155,
+      "step": 163
+    },
+    {
+      "epoch": 0.6833333333333333,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00025598998841950106,
+      "loss": 5.8009,
+      "step": 164
+    },
+    {
+      "epoch": 0.6875,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002500000000000001,
+      "loss": 5.6332,
+      "step": 165
+    },
+    {
+      "epoch": 0.6916666666666667,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00024405747545519962,
+      "loss": 5.3542,
+      "step": 166
+    },
+    {
+      "epoch": 0.6958333333333333,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00023816354300606107,
+      "loss": 5.6408,
+      "step": 167
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002323193216480698,
+      "loss": 5.6203,
+      "step": 168
+    },
+    {
+      "epoch": 0.7041666666666667,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00022652592093878665,
+      "loss": 5.6601,
+      "step": 169
+    },
+    {
+      "epoch": 0.7083333333333334,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002207844407871929,
+      "loss": 5.8144,
+      "step": 170
+    },
+    {
+      "epoch": 0.7125,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002150959712448669,
+      "loss": 5.5789,
+      "step": 171
+    },
+    {
+      "epoch": 0.7166666666666667,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0002094615922990309,
+      "loss": 5.6727,
+      "step": 172
+    },
+    {
+      "epoch": 0.7208333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00020388237366751006,
+      "loss": 5.4561,
+      "step": 173
+    },
+    {
+      "epoch": 0.725,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00019835937459564064,
+      "loss": 5.7376,
+      "step": 174
+    },
+    {
+      "epoch": 0.7291666666666666,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00019289364365516608,
+      "loss": 5.5645,
+      "step": 175
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00018748621854516078,
+      "loss": 5.4465,
+      "step": 176
+    },
+    {
+      "epoch": 0.7375,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0001821381258950161,
+      "loss": 5.6323,
+      "step": 177
+    },
+    {
+      "epoch": 0.7416666666666667,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0001768503810695295,
+      "loss": 5.7042,
+      "step": 178
+    },
+    {
+      "epoch": 0.7458333333333333,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00017162398797613282,
+      "loss": 5.549,
+      "step": 179
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.00016645993887429345,
+      "loss": 5.7366,
+      "step": 180
+    },
+    {
+      "epoch": 0.7541666666666667,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00016135921418712956,
+      "loss": 5.7234,
+      "step": 181
+    },
+    {
+      "epoch": 0.7583333333333333,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0001563227823152708,
+      "loss": 5.4976,
+      "step": 182
+    },
+    {
+      "epoch": 0.7625,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001513515994530023,
+      "loss": 5.4723,
+      "step": 183
+    },
+    {
+      "epoch": 0.7666666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00014644660940672628,
+      "loss": 5.5005,
+      "step": 184
+    },
+    {
+      "epoch": 0.7708333333333334,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00014160874341577446,
+      "loss": 5.4176,
+      "step": 185
+    },
+    {
+      "epoch": 0.775,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.0001368389199756075,
+      "loss": 5.5741,
+      "step": 186
+    },
+    {
+      "epoch": 0.7791666666666667,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0001321380446634342,
+      "loss": 5.2794,
+      "step": 187
+    },
+    {
+      "epoch": 0.7833333333333333,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0001275070099662815,
+      "loss": 5.5068,
+      "step": 188
+    },
+    {
+      "epoch": 0.7875,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00012294669511155192,
+      "loss": 5.614,
+      "step": 189
+    },
+    {
+      "epoch": 0.7916666666666666,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00011845796590009682,
+      "loss": 5.6004,
+      "step": 190
+    },
+    {
+      "epoch": 0.7958333333333333,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00011404167454183955,
+      "loss": 5.5943,
+      "step": 191
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.000109698659493979,
+      "loss": 5.5778,
+      "step": 192
+    },
+    {
+      "epoch": 0.8041666666666667,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00010542974530180327,
+      "loss": 5.6804,
+      "step": 193
+    },
+    {
+      "epoch": 0.8083333333333333,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00010123574244214551,
+      "loss": 5.6639,
+      "step": 194
+    },
+    {
+      "epoch": 0.8125,
+      "grad_norm": 0.3046875,
+      "learning_rate": 9.711744716951093e-05,
+      "loss": 5.4121,
+      "step": 195
+    },
+    {
+      "epoch": 0.8166666666666667,
+      "grad_norm": 0.390625,
+      "learning_rate": 9.307564136490254e-05,
+      "loss": 5.4785,
+      "step": 196
+    },
+    {
+      "epoch": 0.8208333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 8.911109238737747e-05,
+      "loss": 5.6047,
+      "step": 197
+    },
+    {
+      "epoch": 0.825,
+      "grad_norm": 0.23046875,
+      "learning_rate": 8.522455292835934e-05,
+      "loss": 5.6457,
+      "step": 198
+    },
+    {
+      "epoch": 0.8291666666666667,
+      "grad_norm": 0.232421875,
+      "learning_rate": 8.141676086873573e-05,
+      "loss": 5.6667,
+      "step": 199
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.2109375,
+      "learning_rate": 7.768843913876755e-05,
+      "loss": 5.7239,
+      "step": 200
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "eval_runtime": 1.198,
+      "eval_samples_per_second": 13.356,
+      "eval_steps_per_second": 0.835,
+      "step": 200
     }
   ],
   "logging_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 1.018894554759168e+17,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null