Training in progress, step 159000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ae4822495b17518f3a55422be75e8f39fa5ce0b3a594c8d20dc409f424fc55f1
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a748f752d89b80e6b8bbdf3f3863033a42fa4fd7d76ff87d364cfe1c8a3f1531
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c359d76fc12146bca34b3d81bea11d1cbc59763f4362e664b11294254a3637f
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bcf0ec0a64c804385e2c0458be72943b2083978ccaa5abb266ef0e69fa674231
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 3.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -28132,11 +28132,189 @@
|
|
| 28132 |
"eval_steps_per_second": 15.499,
|
| 28133 |
"num_input_tokens_seen": 92262378048,
|
| 28134 |
"step": 158000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28135 |
}
|
| 28136 |
],
|
| 28137 |
"logging_steps": 50,
|
| 28138 |
"max_steps": 200000,
|
| 28139 |
-
"num_input_tokens_seen":
|
| 28140 |
"num_train_epochs": 5,
|
| 28141 |
"save_steps": 1000,
|
| 28142 |
"stateful_callbacks": {
|
|
@@ -28151,7 +28329,7 @@
|
|
| 28151 |
"attributes": {}
|
| 28152 |
}
|
| 28153 |
},
|
| 28154 |
-
"total_flos": 1.
|
| 28155 |
"train_batch_size": 32,
|
| 28156 |
"trial_name": null,
|
| 28157 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.492654756029168,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 159000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 28132 |
"eval_steps_per_second": 15.499,
|
| 28133 |
"num_input_tokens_seen": 92262378048,
|
| 28134 |
"step": 158000
|
| 28135 |
+
},
|
| 28136 |
+
{
|
| 28137 |
+
"epoch": 3.4717870006239733,
|
| 28138 |
+
"grad_norm": 0.09290427714586258,
|
| 28139 |
+
"learning_rate": 0.0001,
|
| 28140 |
+
"loss": 2.3419,
|
| 28141 |
+
"num_input_tokens_seen": 92314806848,
|
| 28142 |
+
"step": 158050
|
| 28143 |
+
},
|
| 28144 |
+
{
|
| 28145 |
+
"epoch": 3.4728853035400364,
|
| 28146 |
+
"grad_norm": 0.09033751487731934,
|
| 28147 |
+
"learning_rate": 0.0001,
|
| 28148 |
+
"loss": 2.344,
|
| 28149 |
+
"num_input_tokens_seen": 92367235648,
|
| 28150 |
+
"step": 158100
|
| 28151 |
+
},
|
| 28152 |
+
{
|
| 28153 |
+
"epoch": 3.473983606456099,
|
| 28154 |
+
"grad_norm": 0.0893242284655571,
|
| 28155 |
+
"learning_rate": 0.0001,
|
| 28156 |
+
"loss": 2.342,
|
| 28157 |
+
"num_input_tokens_seen": 92419664448,
|
| 28158 |
+
"step": 158150
|
| 28159 |
+
},
|
| 28160 |
+
{
|
| 28161 |
+
"epoch": 3.475081909372162,
|
| 28162 |
+
"grad_norm": 0.09877942502498627,
|
| 28163 |
+
"learning_rate": 0.0001,
|
| 28164 |
+
"loss": 2.3459,
|
| 28165 |
+
"num_input_tokens_seen": 92472093248,
|
| 28166 |
+
"step": 158200
|
| 28167 |
+
},
|
| 28168 |
+
{
|
| 28169 |
+
"epoch": 3.476180212288225,
|
| 28170 |
+
"grad_norm": 0.09732919931411743,
|
| 28171 |
+
"learning_rate": 0.0001,
|
| 28172 |
+
"loss": 2.3476,
|
| 28173 |
+
"num_input_tokens_seen": 92524522048,
|
| 28174 |
+
"step": 158250
|
| 28175 |
+
},
|
| 28176 |
+
{
|
| 28177 |
+
"epoch": 3.4772785152042878,
|
| 28178 |
+
"grad_norm": 0.08927954733371735,
|
| 28179 |
+
"learning_rate": 0.0001,
|
| 28180 |
+
"loss": 2.349,
|
| 28181 |
+
"num_input_tokens_seen": 92576950848,
|
| 28182 |
+
"step": 158300
|
| 28183 |
+
},
|
| 28184 |
+
{
|
| 28185 |
+
"epoch": 3.4783768181203505,
|
| 28186 |
+
"grad_norm": 0.09306230396032333,
|
| 28187 |
+
"learning_rate": 0.0001,
|
| 28188 |
+
"loss": 2.3396,
|
| 28189 |
+
"num_input_tokens_seen": 92629379648,
|
| 28190 |
+
"step": 158350
|
| 28191 |
+
},
|
| 28192 |
+
{
|
| 28193 |
+
"epoch": 3.4794751210364137,
|
| 28194 |
+
"grad_norm": 0.09522947669029236,
|
| 28195 |
+
"learning_rate": 0.0001,
|
| 28196 |
+
"loss": 2.3394,
|
| 28197 |
+
"num_input_tokens_seen": 92681808448,
|
| 28198 |
+
"step": 158400
|
| 28199 |
+
},
|
| 28200 |
+
{
|
| 28201 |
+
"epoch": 3.4805734239524764,
|
| 28202 |
+
"grad_norm": 0.09624941647052765,
|
| 28203 |
+
"learning_rate": 0.0001,
|
| 28204 |
+
"loss": 2.3475,
|
| 28205 |
+
"num_input_tokens_seen": 92734237248,
|
| 28206 |
+
"step": 158450
|
| 28207 |
+
},
|
| 28208 |
+
{
|
| 28209 |
+
"epoch": 3.481671726868539,
|
| 28210 |
+
"grad_norm": 0.09459653496742249,
|
| 28211 |
+
"learning_rate": 0.0001,
|
| 28212 |
+
"loss": 2.3431,
|
| 28213 |
+
"num_input_tokens_seen": 92786665216,
|
| 28214 |
+
"step": 158500
|
| 28215 |
+
},
|
| 28216 |
+
{
|
| 28217 |
+
"epoch": 3.481671726868539,
|
| 28218 |
+
"eval_loss": 2.257422685623169,
|
| 28219 |
+
"eval_runtime": 80.6973,
|
| 28220 |
+
"eval_samples_per_second": 61.96,
|
| 28221 |
+
"eval_steps_per_second": 15.49,
|
| 28222 |
+
"num_input_tokens_seen": 92786665216,
|
| 28223 |
+
"step": 158500
|
| 28224 |
+
},
|
| 28225 |
+
{
|
| 28226 |
+
"epoch": 3.4827700297846023,
|
| 28227 |
+
"grad_norm": 0.09564249962568283,
|
| 28228 |
+
"learning_rate": 0.0001,
|
| 28229 |
+
"loss": 2.341,
|
| 28230 |
+
"num_input_tokens_seen": 92839093024,
|
| 28231 |
+
"step": 158550
|
| 28232 |
+
},
|
| 28233 |
+
{
|
| 28234 |
+
"epoch": 3.483868332700665,
|
| 28235 |
+
"grad_norm": 0.10864699631929398,
|
| 28236 |
+
"learning_rate": 0.0001,
|
| 28237 |
+
"loss": 2.3405,
|
| 28238 |
+
"num_input_tokens_seen": 92891521824,
|
| 28239 |
+
"step": 158600
|
| 28240 |
+
},
|
| 28241 |
+
{
|
| 28242 |
+
"epoch": 3.484966635616728,
|
| 28243 |
+
"grad_norm": 0.09777586907148361,
|
| 28244 |
+
"learning_rate": 0.0001,
|
| 28245 |
+
"loss": 2.3445,
|
| 28246 |
+
"num_input_tokens_seen": 92943950624,
|
| 28247 |
+
"step": 158650
|
| 28248 |
+
},
|
| 28249 |
+
{
|
| 28250 |
+
"epoch": 3.486064938532791,
|
| 28251 |
+
"grad_norm": 0.09032690525054932,
|
| 28252 |
+
"learning_rate": 0.0001,
|
| 28253 |
+
"loss": 2.3423,
|
| 28254 |
+
"num_input_tokens_seen": 92996375712,
|
| 28255 |
+
"step": 158700
|
| 28256 |
+
},
|
| 28257 |
+
{
|
| 28258 |
+
"epoch": 3.4871632414488536,
|
| 28259 |
+
"grad_norm": 0.09027489274740219,
|
| 28260 |
+
"learning_rate": 0.0001,
|
| 28261 |
+
"loss": 2.3412,
|
| 28262 |
+
"num_input_tokens_seen": 93048803136,
|
| 28263 |
+
"step": 158750
|
| 28264 |
+
},
|
| 28265 |
+
{
|
| 28266 |
+
"epoch": 3.4882615443649168,
|
| 28267 |
+
"grad_norm": 0.09923077374696732,
|
| 28268 |
+
"learning_rate": 0.0001,
|
| 28269 |
+
"loss": 2.3455,
|
| 28270 |
+
"num_input_tokens_seen": 93101231936,
|
| 28271 |
+
"step": 158800
|
| 28272 |
+
},
|
| 28273 |
+
{
|
| 28274 |
+
"epoch": 3.4893598472809795,
|
| 28275 |
+
"grad_norm": 0.10047315806150436,
|
| 28276 |
+
"learning_rate": 0.0001,
|
| 28277 |
+
"loss": 2.3416,
|
| 28278 |
+
"num_input_tokens_seen": 93153660736,
|
| 28279 |
+
"step": 158850
|
| 28280 |
+
},
|
| 28281 |
+
{
|
| 28282 |
+
"epoch": 3.4904581501970426,
|
| 28283 |
+
"grad_norm": 0.0912187322974205,
|
| 28284 |
+
"learning_rate": 0.0001,
|
| 28285 |
+
"loss": 2.3437,
|
| 28286 |
+
"num_input_tokens_seen": 93206089536,
|
| 28287 |
+
"step": 158900
|
| 28288 |
+
},
|
| 28289 |
+
{
|
| 28290 |
+
"epoch": 3.4915564531131054,
|
| 28291 |
+
"grad_norm": 0.09997432678937912,
|
| 28292 |
+
"learning_rate": 0.0001,
|
| 28293 |
+
"loss": 2.341,
|
| 28294 |
+
"num_input_tokens_seen": 93258518336,
|
| 28295 |
+
"step": 158950
|
| 28296 |
+
},
|
| 28297 |
+
{
|
| 28298 |
+
"epoch": 3.492654756029168,
|
| 28299 |
+
"grad_norm": 0.09082050621509552,
|
| 28300 |
+
"learning_rate": 0.0001,
|
| 28301 |
+
"loss": 2.3338,
|
| 28302 |
+
"num_input_tokens_seen": 93310947136,
|
| 28303 |
+
"step": 159000
|
| 28304 |
+
},
|
| 28305 |
+
{
|
| 28306 |
+
"epoch": 3.492654756029168,
|
| 28307 |
+
"eval_loss": 2.2572686672210693,
|
| 28308 |
+
"eval_runtime": 80.2123,
|
| 28309 |
+
"eval_samples_per_second": 62.335,
|
| 28310 |
+
"eval_steps_per_second": 15.584,
|
| 28311 |
+
"num_input_tokens_seen": 93310947136,
|
| 28312 |
+
"step": 159000
|
| 28313 |
}
|
| 28314 |
],
|
| 28315 |
"logging_steps": 50,
|
| 28316 |
"max_steps": 200000,
|
| 28317 |
+
"num_input_tokens_seen": 93310947136,
|
| 28318 |
"num_train_epochs": 5,
|
| 28319 |
"save_steps": 1000,
|
| 28320 |
"stateful_callbacks": {
|
|
|
|
| 28329 |
"attributes": {}
|
| 28330 |
}
|
| 28331 |
},
|
| 28332 |
+
"total_flos": 1.651433565139625e+20,
|
| 28333 |
"train_batch_size": 32,
|
| 28334 |
"trial_name": null,
|
| 28335 |
"trial_params": null
|