Training in progress, step 65000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:97f833e77e28bcce2d00fc8f583d642be803be2e4268c16065f001da61ccfb12
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e6fb466dd570b07209b2b66d3759663a3b462b568c13bb8f7963bf1191bda0a0
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5abe0ab18889dbab668e6d9fae1d62109a3226e616d0e681a91c9a668ea4330
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:83439c671f875b1f809ad8f03d85b4a006312176c0266e869dc1f2efa804bb73
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -11400,11 +11400,189 @@
|
|
| 11400 |
"eval_steps_per_second": 23.518,
|
| 11401 |
"num_input_tokens_seen": 16777216000,
|
| 11402 |
"step": 64000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11403 |
}
|
| 11404 |
],
|
| 11405 |
"logging_steps": 50,
|
| 11406 |
"max_steps": 70000,
|
| 11407 |
-
"num_input_tokens_seen":
|
| 11408 |
"num_train_epochs": 1,
|
| 11409 |
"save_steps": 1000,
|
| 11410 |
"stateful_callbacks": {
|
|
@@ -11419,7 +11597,7 @@
|
|
| 11419 |
"attributes": {}
|
| 11420 |
}
|
| 11421 |
},
|
| 11422 |
-
"total_flos": 4.
|
| 11423 |
"train_batch_size": 64,
|
| 11424 |
"trial_name": null,
|
| 11425 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.4372224268198963,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 65000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 11400 |
"eval_steps_per_second": 23.518,
|
| 11401 |
"num_input_tokens_seen": 16777216000,
|
| 11402 |
"step": 64000
|
| 11403 |
+
},
|
| 11404 |
+
{
|
| 11405 |
+
"epoch": 0.43083225288945165,
|
| 11406 |
+
"grad_norm": 0.15883377194404602,
|
| 11407 |
+
"learning_rate": 0.000304132494574022,
|
| 11408 |
+
"loss": 2.9851,
|
| 11409 |
+
"num_input_tokens_seen": 16790323200,
|
| 11410 |
+
"step": 64050
|
| 11411 |
+
},
|
| 11412 |
+
{
|
| 11413 |
+
"epoch": 0.43116857783315926,
|
| 11414 |
+
"grad_norm": 0.176467627286911,
|
| 11415 |
+
"learning_rate": 0.00029962558344842963,
|
| 11416 |
+
"loss": 2.9865,
|
| 11417 |
+
"num_input_tokens_seen": 16803430400,
|
| 11418 |
+
"step": 64100
|
| 11419 |
+
},
|
| 11420 |
+
{
|
| 11421 |
+
"epoch": 0.43150490277686687,
|
| 11422 |
+
"grad_norm": 0.16392388939857483,
|
| 11423 |
+
"learning_rate": 0.00029513798482615227,
|
| 11424 |
+
"loss": 2.9788,
|
| 11425 |
+
"num_input_tokens_seen": 16816537600,
|
| 11426 |
+
"step": 64150
|
| 11427 |
+
},
|
| 11428 |
+
{
|
| 11429 |
+
"epoch": 0.4318412277205745,
|
| 11430 |
+
"grad_norm": 0.15614169836044312,
|
| 11431 |
+
"learning_rate": 0.0002906701312312861,
|
| 11432 |
+
"loss": 2.9769,
|
| 11433 |
+
"num_input_tokens_seen": 16829644800,
|
| 11434 |
+
"step": 64200
|
| 11435 |
+
},
|
| 11436 |
+
{
|
| 11437 |
+
"epoch": 0.43217755266428215,
|
| 11438 |
+
"grad_norm": 0.16225555539131165,
|
| 11439 |
+
"learning_rate": 0.00028622245328485907,
|
| 11440 |
+
"loss": 2.9881,
|
| 11441 |
+
"num_input_tokens_seen": 16842752000,
|
| 11442 |
+
"step": 64250
|
| 11443 |
+
},
|
| 11444 |
+
{
|
| 11445 |
+
"epoch": 0.43251387760798976,
|
| 11446 |
+
"grad_norm": 0.16419048607349396,
|
| 11447 |
+
"learning_rate": 0.0002817953796633289,
|
| 11448 |
+
"loss": 2.99,
|
| 11449 |
+
"num_input_tokens_seen": 16855859200,
|
| 11450 |
+
"step": 64300
|
| 11451 |
+
},
|
| 11452 |
+
{
|
| 11453 |
+
"epoch": 0.43285020255169737,
|
| 11454 |
+
"grad_norm": 0.16654469072818756,
|
| 11455 |
+
"learning_rate": 0.000277389337057266,
|
| 11456 |
+
"loss": 2.9919,
|
| 11457 |
+
"num_input_tokens_seen": 16868966400,
|
| 11458 |
+
"step": 64350
|
| 11459 |
+
},
|
| 11460 |
+
{
|
| 11461 |
+
"epoch": 0.433186527495405,
|
| 11462 |
+
"grad_norm": 0.1688661277294159,
|
| 11463 |
+
"learning_rate": 0.00027300475013022663,
|
| 11464 |
+
"loss": 2.9844,
|
| 11465 |
+
"num_input_tokens_seen": 16882073600,
|
| 11466 |
+
"step": 64400
|
| 11467 |
+
},
|
| 11468 |
+
{
|
| 11469 |
+
"epoch": 0.4335228524391126,
|
| 11470 |
+
"grad_norm": 0.162180095911026,
|
| 11471 |
+
"learning_rate": 0.000268642041477825,
|
| 11472 |
+
"loss": 2.9847,
|
| 11473 |
+
"num_input_tokens_seen": 16895180800,
|
| 11474 |
+
"step": 64450
|
| 11475 |
+
},
|
| 11476 |
+
{
|
| 11477 |
+
"epoch": 0.4338591773828202,
|
| 11478 |
+
"grad_norm": 0.18244421482086182,
|
| 11479 |
+
"learning_rate": 0.00026430163158700117,
|
| 11480 |
+
"loss": 2.9789,
|
| 11481 |
+
"num_input_tokens_seen": 16908288000,
|
| 11482 |
+
"step": 64500
|
| 11483 |
+
},
|
| 11484 |
+
{
|
| 11485 |
+
"epoch": 0.4338591773828202,
|
| 11486 |
+
"eval_loss": 2.8813860416412354,
|
| 11487 |
+
"eval_runtime": 53.1806,
|
| 11488 |
+
"eval_samples_per_second": 94.019,
|
| 11489 |
+
"eval_steps_per_second": 23.505,
|
| 11490 |
+
"num_input_tokens_seen": 16908288000,
|
| 11491 |
+
"step": 64500
|
| 11492 |
+
},
|
| 11493 |
+
{
|
| 11494 |
+
"epoch": 0.4341955023265278,
|
| 11495 |
+
"grad_norm": 0.15887753665447235,
|
| 11496 |
+
"learning_rate": 0.00025998393879549445,
|
| 11497 |
+
"loss": 2.9723,
|
| 11498 |
+
"num_input_tokens_seen": 16921395200,
|
| 11499 |
+
"step": 64550
|
| 11500 |
+
},
|
| 11501 |
+
{
|
| 11502 |
+
"epoch": 0.4345318272702354,
|
| 11503 |
+
"grad_norm": 0.17573221027851105,
|
| 11504 |
+
"learning_rate": 0.0002556893792515227,
|
| 11505 |
+
"loss": 2.99,
|
| 11506 |
+
"num_input_tokens_seen": 16934502400,
|
| 11507 |
+
"step": 64600
|
| 11508 |
+
},
|
| 11509 |
+
{
|
| 11510 |
+
"epoch": 0.43486815221394304,
|
| 11511 |
+
"grad_norm": 0.1790430247783661,
|
| 11512 |
+
"learning_rate": 0.0002514183668736727,
|
| 11513 |
+
"loss": 2.9887,
|
| 11514 |
+
"num_input_tokens_seen": 16947609600,
|
| 11515 |
+
"step": 64650
|
| 11516 |
+
},
|
| 11517 |
+
{
|
| 11518 |
+
"epoch": 0.43520447715765065,
|
| 11519 |
+
"grad_norm": 0.16031622886657715,
|
| 11520 |
+
"learning_rate": 0.0002471713133110078,
|
| 11521 |
+
"loss": 2.9835,
|
| 11522 |
+
"num_input_tokens_seen": 16960716800,
|
| 11523 |
+
"step": 64700
|
| 11524 |
+
},
|
| 11525 |
+
{
|
| 11526 |
+
"epoch": 0.43554080210135826,
|
| 11527 |
+
"grad_norm": 0.1702345311641693,
|
| 11528 |
+
"learning_rate": 0.0002429486279033892,
|
| 11529 |
+
"loss": 2.9862,
|
| 11530 |
+
"num_input_tokens_seen": 16973824000,
|
| 11531 |
+
"step": 64750
|
| 11532 |
+
},
|
| 11533 |
+
{
|
| 11534 |
+
"epoch": 0.43587712704506587,
|
| 11535 |
+
"grad_norm": 0.16080138087272644,
|
| 11536 |
+
"learning_rate": 0.00023875071764202561,
|
| 11537 |
+
"loss": 2.9785,
|
| 11538 |
+
"num_input_tokens_seen": 16986931200,
|
| 11539 |
+
"step": 64800
|
| 11540 |
+
},
|
| 11541 |
+
{
|
| 11542 |
+
"epoch": 0.4362134519887735,
|
| 11543 |
+
"grad_norm": 0.17694465816020966,
|
| 11544 |
+
"learning_rate": 0.0002345779871302453,
|
| 11545 |
+
"loss": 2.9962,
|
| 11546 |
+
"num_input_tokens_seen": 17000038400,
|
| 11547 |
+
"step": 64850
|
| 11548 |
+
},
|
| 11549 |
+
{
|
| 11550 |
+
"epoch": 0.4365497769324811,
|
| 11551 |
+
"grad_norm": 0.15310978889465332,
|
| 11552 |
+
"learning_rate": 0.00023043083854449987,
|
| 11553 |
+
"loss": 2.98,
|
| 11554 |
+
"num_input_tokens_seen": 17013145600,
|
| 11555 |
+
"step": 64900
|
| 11556 |
+
},
|
| 11557 |
+
{
|
| 11558 |
+
"epoch": 0.4368861018761887,
|
| 11559 |
+
"grad_norm": 0.15505504608154297,
|
| 11560 |
+
"learning_rate": 0.0002263096715956019,
|
| 11561 |
+
"loss": 2.9825,
|
| 11562 |
+
"num_input_tokens_seen": 17026252800,
|
| 11563 |
+
"step": 64950
|
| 11564 |
+
},
|
| 11565 |
+
{
|
| 11566 |
+
"epoch": 0.4372224268198963,
|
| 11567 |
+
"grad_norm": 0.15211448073387146,
|
| 11568 |
+
"learning_rate": 0.00022221488349019903,
|
| 11569 |
+
"loss": 2.9876,
|
| 11570 |
+
"num_input_tokens_seen": 17039360000,
|
| 11571 |
+
"step": 65000
|
| 11572 |
+
},
|
| 11573 |
+
{
|
| 11574 |
+
"epoch": 0.4372224268198963,
|
| 11575 |
+
"eval_loss": 2.8792829513549805,
|
| 11576 |
+
"eval_runtime": 53.0249,
|
| 11577 |
+
"eval_samples_per_second": 94.295,
|
| 11578 |
+
"eval_steps_per_second": 23.574,
|
| 11579 |
+
"num_input_tokens_seen": 17039360000,
|
| 11580 |
+
"step": 65000
|
| 11581 |
}
|
| 11582 |
],
|
| 11583 |
"logging_steps": 50,
|
| 11584 |
"max_steps": 70000,
|
| 11585 |
+
"num_input_tokens_seen": 17039360000,
|
| 11586 |
"num_train_epochs": 1,
|
| 11587 |
"save_steps": 1000,
|
| 11588 |
"stateful_callbacks": {
|
|
|
|
| 11597 |
"attributes": {}
|
| 11598 |
}
|
| 11599 |
},
|
| 11600 |
+
"total_flos": 4.5581951041536e+18,
|
| 11601 |
"train_batch_size": 64,
|
| 11602 |
"trial_name": null,
|
| 11603 |
"trial_params": null
|