Training in progress, step 47000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0f4e08ed2a6d62d28d840192a090317a05ca939879ecf26aa2b319d9c763f735
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b2e15feb0f7f3fe2709a8b7d31a3a5c543a260dee03048851f465de58a0a6ac
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ef6d6c68b31cc97d3a7886b7338b6c21c45d7ba1c6c1b89db7e0a3456d53ecda
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:30a691323d967d54c1c0f6fb771a9863c3def8ea94c66492bb5dbdffa3e83798
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -8196,11 +8196,189 @@
|
|
| 8196 |
"eval_steps_per_second": 24.661,
|
| 8197 |
"num_input_tokens_seen": 12058619456,
|
| 8198 |
"step": 46000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8199 |
}
|
| 8200 |
],
|
| 8201 |
"logging_steps": 50,
|
| 8202 |
"max_steps": 70000,
|
| 8203 |
-
"num_input_tokens_seen":
|
| 8204 |
"num_train_epochs": 1,
|
| 8205 |
"save_steps": 1000,
|
| 8206 |
"stateful_callbacks": {
|
|
@@ -8215,7 +8393,7 @@
|
|
| 8215 |
"attributes": {}
|
| 8216 |
}
|
| 8217 |
},
|
| 8218 |
-
"total_flos": 3.
|
| 8219 |
"train_batch_size": 64,
|
| 8220 |
"trial_name": null,
|
| 8221 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.22419118260848825,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 47000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 8196 |
"eval_steps_per_second": 24.661,
|
| 8197 |
"num_input_tokens_seen": 12058619456,
|
| 8198 |
"step": 46000
|
| 8199 |
+
},
|
| 8200 |
+
{
|
| 8201 |
+
"epoch": 0.21965965870469967,
|
| 8202 |
+
"grad_norm": 0.19492709636688232,
|
| 8203 |
+
"learning_rate": 0.001,
|
| 8204 |
+
"loss": 2.6196,
|
| 8205 |
+
"num_input_tokens_seen": 12071726656,
|
| 8206 |
+
"step": 46050
|
| 8207 |
+
},
|
| 8208 |
+
{
|
| 8209 |
+
"epoch": 0.2198981599627938,
|
| 8210 |
+
"grad_norm": 0.19643568992614746,
|
| 8211 |
+
"learning_rate": 0.001,
|
| 8212 |
+
"loss": 2.6108,
|
| 8213 |
+
"num_input_tokens_seen": 12084833856,
|
| 8214 |
+
"step": 46100
|
| 8215 |
+
},
|
| 8216 |
+
{
|
| 8217 |
+
"epoch": 0.22013666122088793,
|
| 8218 |
+
"grad_norm": 0.18720099329948425,
|
| 8219 |
+
"learning_rate": 0.001,
|
| 8220 |
+
"loss": 2.6181,
|
| 8221 |
+
"num_input_tokens_seen": 12097941056,
|
| 8222 |
+
"step": 46150
|
| 8223 |
+
},
|
| 8224 |
+
{
|
| 8225 |
+
"epoch": 0.2203751624789821,
|
| 8226 |
+
"grad_norm": 0.1929876208305359,
|
| 8227 |
+
"learning_rate": 0.001,
|
| 8228 |
+
"loss": 2.6152,
|
| 8229 |
+
"num_input_tokens_seen": 12111048256,
|
| 8230 |
+
"step": 46200
|
| 8231 |
+
},
|
| 8232 |
+
{
|
| 8233 |
+
"epoch": 0.22061366373707622,
|
| 8234 |
+
"grad_norm": 0.19732603430747986,
|
| 8235 |
+
"learning_rate": 0.001,
|
| 8236 |
+
"loss": 2.6267,
|
| 8237 |
+
"num_input_tokens_seen": 12124155456,
|
| 8238 |
+
"step": 46250
|
| 8239 |
+
},
|
| 8240 |
+
{
|
| 8241 |
+
"epoch": 0.22085216499517035,
|
| 8242 |
+
"grad_norm": 0.1964132934808731,
|
| 8243 |
+
"learning_rate": 0.001,
|
| 8244 |
+
"loss": 2.605,
|
| 8245 |
+
"num_input_tokens_seen": 12137262656,
|
| 8246 |
+
"step": 46300
|
| 8247 |
+
},
|
| 8248 |
+
{
|
| 8249 |
+
"epoch": 0.22109066625326448,
|
| 8250 |
+
"grad_norm": 0.1927288919687271,
|
| 8251 |
+
"learning_rate": 0.001,
|
| 8252 |
+
"loss": 2.6178,
|
| 8253 |
+
"num_input_tokens_seen": 12150369856,
|
| 8254 |
+
"step": 46350
|
| 8255 |
+
},
|
| 8256 |
+
{
|
| 8257 |
+
"epoch": 0.22132916751135862,
|
| 8258 |
+
"grad_norm": 0.17873398959636688,
|
| 8259 |
+
"learning_rate": 0.001,
|
| 8260 |
+
"loss": 2.6033,
|
| 8261 |
+
"num_input_tokens_seen": 12163477056,
|
| 8262 |
+
"step": 46400
|
| 8263 |
+
},
|
| 8264 |
+
{
|
| 8265 |
+
"epoch": 0.22156766876945275,
|
| 8266 |
+
"grad_norm": 0.24716190993785858,
|
| 8267 |
+
"learning_rate": 0.001,
|
| 8268 |
+
"loss": 2.6141,
|
| 8269 |
+
"num_input_tokens_seen": 12176584256,
|
| 8270 |
+
"step": 46450
|
| 8271 |
+
},
|
| 8272 |
+
{
|
| 8273 |
+
"epoch": 0.2218061700275469,
|
| 8274 |
+
"grad_norm": 0.2021339386701584,
|
| 8275 |
+
"learning_rate": 0.001,
|
| 8276 |
+
"loss": 2.6259,
|
| 8277 |
+
"num_input_tokens_seen": 12189691456,
|
| 8278 |
+
"step": 46500
|
| 8279 |
+
},
|
| 8280 |
+
{
|
| 8281 |
+
"epoch": 0.2218061700275469,
|
| 8282 |
+
"eval_loss": 2.4975087642669678,
|
| 8283 |
+
"eval_runtime": 50.8921,
|
| 8284 |
+
"eval_samples_per_second": 98.247,
|
| 8285 |
+
"eval_steps_per_second": 24.562,
|
| 8286 |
+
"num_input_tokens_seen": 12189691456,
|
| 8287 |
+
"step": 46500
|
| 8288 |
+
},
|
| 8289 |
+
{
|
| 8290 |
+
"epoch": 0.22204467128564104,
|
| 8291 |
+
"grad_norm": 0.20796166360378265,
|
| 8292 |
+
"learning_rate": 0.001,
|
| 8293 |
+
"loss": 2.6211,
|
| 8294 |
+
"num_input_tokens_seen": 12202798656,
|
| 8295 |
+
"step": 46550
|
| 8296 |
+
},
|
| 8297 |
+
{
|
| 8298 |
+
"epoch": 0.22228317254373517,
|
| 8299 |
+
"grad_norm": 0.20472556352615356,
|
| 8300 |
+
"learning_rate": 0.001,
|
| 8301 |
+
"loss": 2.6123,
|
| 8302 |
+
"num_input_tokens_seen": 12215905856,
|
| 8303 |
+
"step": 46600
|
| 8304 |
+
},
|
| 8305 |
+
{
|
| 8306 |
+
"epoch": 0.2225216738018293,
|
| 8307 |
+
"grad_norm": 0.20017485320568085,
|
| 8308 |
+
"learning_rate": 0.001,
|
| 8309 |
+
"loss": 2.6037,
|
| 8310 |
+
"num_input_tokens_seen": 12229013056,
|
| 8311 |
+
"step": 46650
|
| 8312 |
+
},
|
| 8313 |
+
{
|
| 8314 |
+
"epoch": 0.22276017505992343,
|
| 8315 |
+
"grad_norm": 0.2037762850522995,
|
| 8316 |
+
"learning_rate": 0.001,
|
| 8317 |
+
"loss": 2.6155,
|
| 8318 |
+
"num_input_tokens_seen": 12242120256,
|
| 8319 |
+
"step": 46700
|
| 8320 |
+
},
|
| 8321 |
+
{
|
| 8322 |
+
"epoch": 0.2229986763180176,
|
| 8323 |
+
"grad_norm": 0.19346804916858673,
|
| 8324 |
+
"learning_rate": 0.001,
|
| 8325 |
+
"loss": 2.601,
|
| 8326 |
+
"num_input_tokens_seen": 12255227456,
|
| 8327 |
+
"step": 46750
|
| 8328 |
+
},
|
| 8329 |
+
{
|
| 8330 |
+
"epoch": 0.22323717757611172,
|
| 8331 |
+
"grad_norm": 0.18640096485614777,
|
| 8332 |
+
"learning_rate": 0.001,
|
| 8333 |
+
"loss": 2.6168,
|
| 8334 |
+
"num_input_tokens_seen": 12268334656,
|
| 8335 |
+
"step": 46800
|
| 8336 |
+
},
|
| 8337 |
+
{
|
| 8338 |
+
"epoch": 0.22347567883420585,
|
| 8339 |
+
"grad_norm": 0.20295055210590363,
|
| 8340 |
+
"learning_rate": 0.001,
|
| 8341 |
+
"loss": 2.6221,
|
| 8342 |
+
"num_input_tokens_seen": 12281441856,
|
| 8343 |
+
"step": 46850
|
| 8344 |
+
},
|
| 8345 |
+
{
|
| 8346 |
+
"epoch": 0.22371418009229999,
|
| 8347 |
+
"grad_norm": 0.20705671608448029,
|
| 8348 |
+
"learning_rate": 0.001,
|
| 8349 |
+
"loss": 2.6202,
|
| 8350 |
+
"num_input_tokens_seen": 12294549056,
|
| 8351 |
+
"step": 46900
|
| 8352 |
+
},
|
| 8353 |
+
{
|
| 8354 |
+
"epoch": 0.22395268135039412,
|
| 8355 |
+
"grad_norm": 0.18724282085895538,
|
| 8356 |
+
"learning_rate": 0.001,
|
| 8357 |
+
"loss": 2.6061,
|
| 8358 |
+
"num_input_tokens_seen": 12307656256,
|
| 8359 |
+
"step": 46950
|
| 8360 |
+
},
|
| 8361 |
+
{
|
| 8362 |
+
"epoch": 0.22419118260848825,
|
| 8363 |
+
"grad_norm": 0.18210910260677338,
|
| 8364 |
+
"learning_rate": 0.001,
|
| 8365 |
+
"loss": 2.6045,
|
| 8366 |
+
"num_input_tokens_seen": 12320763456,
|
| 8367 |
+
"step": 47000
|
| 8368 |
+
},
|
| 8369 |
+
{
|
| 8370 |
+
"epoch": 0.22419118260848825,
|
| 8371 |
+
"eval_loss": 2.497344493865967,
|
| 8372 |
+
"eval_runtime": 51.17,
|
| 8373 |
+
"eval_samples_per_second": 97.713,
|
| 8374 |
+
"eval_steps_per_second": 24.428,
|
| 8375 |
+
"num_input_tokens_seen": 12320763456,
|
| 8376 |
+
"step": 47000
|
| 8377 |
}
|
| 8378 |
],
|
| 8379 |
"logging_steps": 50,
|
| 8380 |
"max_steps": 70000,
|
| 8381 |
+
"num_input_tokens_seen": 12320763456,
|
| 8382 |
"num_train_epochs": 1,
|
| 8383 |
"save_steps": 1000,
|
| 8384 |
"stateful_callbacks": {
|
|
|
|
| 8393 |
"attributes": {}
|
| 8394 |
}
|
| 8395 |
},
|
| 8396 |
+
"total_flos": 3.2959244751313306e+18,
|
| 8397 |
"train_batch_size": 64,
|
| 8398 |
"trial_name": null,
|
| 8399 |
"trial_params": null
|