Training in progress, step 65000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6ad9801ec7b3ea03c8febaf16be0cca903ae6c5e7ba16db1d0ab836be5805c8b
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dcaa21f2d1112b5786bb6cb8a7af07df0a486ccdc4e343d067ea09aba3ebc0cf
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5abe0ab18889dbab668e6d9fae1d62109a3226e616d0e681a91c9a668ea4330
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b166fab474c8d8470da4ff5d475f9ae65d65d8dd07f0e702e6e8c799bab73616
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -11400,11 +11400,189 @@
|
|
| 11400 |
"eval_steps_per_second": 23.473,
|
| 11401 |
"num_input_tokens_seen": 16777211456,
|
| 11402 |
"step": 64000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11403 |
}
|
| 11404 |
],
|
| 11405 |
"logging_steps": 50,
|
| 11406 |
"max_steps": 70000,
|
| 11407 |
-
"num_input_tokens_seen":
|
| 11408 |
"num_train_epochs": 1,
|
| 11409 |
"save_steps": 1000,
|
| 11410 |
"stateful_callbacks": {
|
|
@@ -11419,7 +11597,7 @@
|
|
| 11419 |
"attributes": {}
|
| 11420 |
}
|
| 11421 |
},
|
| 11422 |
-
"total_flos": 4.
|
| 11423 |
"train_batch_size": 64,
|
| 11424 |
"trial_name": null,
|
| 11425 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.31005163552237736,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 65000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 11400 |
"eval_steps_per_second": 23.473,
|
| 11401 |
"num_input_tokens_seen": 16777211456,
|
| 11402 |
"step": 64000
|
| 11403 |
+
},
|
| 11404 |
+
{
|
| 11405 |
+
"epoch": 0.3055201116185888,
|
| 11406 |
+
"grad_norm": 0.1703004688024521,
|
| 11407 |
+
"learning_rate": 0.00038327731807204744,
|
| 11408 |
+
"loss": 2.5506,
|
| 11409 |
+
"num_input_tokens_seen": 16790318656,
|
| 11410 |
+
"step": 64050
|
| 11411 |
+
},
|
| 11412 |
+
{
|
| 11413 |
+
"epoch": 0.30575861287668293,
|
| 11414 |
+
"grad_norm": 0.19769616425037384,
|
| 11415 |
+
"learning_rate": 0.00037782979693105293,
|
| 11416 |
+
"loss": 2.542,
|
| 11417 |
+
"num_input_tokens_seen": 16803425856,
|
| 11418 |
+
"step": 64100
|
| 11419 |
+
},
|
| 11420 |
+
{
|
| 11421 |
+
"epoch": 0.30599711413477704,
|
| 11422 |
+
"grad_norm": 0.20674961805343628,
|
| 11423 |
+
"learning_rate": 0.00037239765536817873,
|
| 11424 |
+
"loss": 2.539,
|
| 11425 |
+
"num_input_tokens_seen": 16816533056,
|
| 11426 |
+
"step": 64150
|
| 11427 |
+
},
|
| 11428 |
+
{
|
| 11429 |
+
"epoch": 0.3062356153928712,
|
| 11430 |
+
"grad_norm": 0.19121839106082916,
|
| 11431 |
+
"learning_rate": 0.0003669815772166625,
|
| 11432 |
+
"loss": 2.5573,
|
| 11433 |
+
"num_input_tokens_seen": 16829640256,
|
| 11434 |
+
"step": 64200
|
| 11435 |
+
},
|
| 11436 |
+
{
|
| 11437 |
+
"epoch": 0.30647411665096536,
|
| 11438 |
+
"grad_norm": 0.1734025925397873,
|
| 11439 |
+
"learning_rate": 0.00036158224428757535,
|
| 11440 |
+
"loss": 2.5416,
|
| 11441 |
+
"num_input_tokens_seen": 16842747456,
|
| 11442 |
+
"step": 64250
|
| 11443 |
+
},
|
| 11444 |
+
{
|
| 11445 |
+
"epoch": 0.30671261790905946,
|
| 11446 |
+
"grad_norm": 0.1857634037733078,
|
| 11447 |
+
"learning_rate": 0.0003562003362839914,
|
| 11448 |
+
"loss": 2.5652,
|
| 11449 |
+
"num_input_tokens_seen": 16855854656,
|
| 11450 |
+
"step": 64300
|
| 11451 |
+
},
|
| 11452 |
+
{
|
| 11453 |
+
"epoch": 0.3069511191671536,
|
| 11454 |
+
"grad_norm": 0.17733143270015717,
|
| 11455 |
+
"learning_rate": 0.000350836530715422,
|
| 11456 |
+
"loss": 2.5299,
|
| 11457 |
+
"num_input_tokens_seen": 16868961856,
|
| 11458 |
+
"step": 64350
|
| 11459 |
+
},
|
| 11460 |
+
{
|
| 11461 |
+
"epoch": 0.3071896204252477,
|
| 11462 |
+
"grad_norm": 0.18323005735874176,
|
| 11463 |
+
"learning_rate": 0.00034549150281252633,
|
| 11464 |
+
"loss": 2.5691,
|
| 11465 |
+
"num_input_tokens_seen": 16882069056,
|
| 11466 |
+
"step": 64400
|
| 11467 |
+
},
|
| 11468 |
+
{
|
| 11469 |
+
"epoch": 0.3074281216833419,
|
| 11470 |
+
"grad_norm": 0.18570365011692047,
|
| 11471 |
+
"learning_rate": 0.00034016592544210936,
|
| 11472 |
+
"loss": 2.5436,
|
| 11473 |
+
"num_input_tokens_seen": 16895176256,
|
| 11474 |
+
"step": 64450
|
| 11475 |
+
},
|
| 11476 |
+
{
|
| 11477 |
+
"epoch": 0.30766662294143604,
|
| 11478 |
+
"grad_norm": 0.18571798503398895,
|
| 11479 |
+
"learning_rate": 0.00033486046902241664,
|
| 11480 |
+
"loss": 2.5382,
|
| 11481 |
+
"num_input_tokens_seen": 16908283456,
|
| 11482 |
+
"step": 64500
|
| 11483 |
+
},
|
| 11484 |
+
{
|
| 11485 |
+
"epoch": 0.30766662294143604,
|
| 11486 |
+
"eval_loss": 2.4323015213012695,
|
| 11487 |
+
"eval_runtime": 53.7237,
|
| 11488 |
+
"eval_samples_per_second": 93.069,
|
| 11489 |
+
"eval_steps_per_second": 23.267,
|
| 11490 |
+
"num_input_tokens_seen": 16908283456,
|
| 11491 |
+
"step": 64500
|
| 11492 |
+
},
|
| 11493 |
+
{
|
| 11494 |
+
"epoch": 0.30790512419953014,
|
| 11495 |
+
"grad_norm": 0.1829528957605362,
|
| 11496 |
+
"learning_rate": 0.0003295758014387375,
|
| 11497 |
+
"loss": 2.5453,
|
| 11498 |
+
"num_input_tokens_seen": 16921390656,
|
| 11499 |
+
"step": 64550
|
| 11500 |
+
},
|
| 11501 |
+
{
|
| 11502 |
+
"epoch": 0.3081436254576243,
|
| 11503 |
+
"grad_norm": 0.1703086644411087,
|
| 11504 |
+
"learning_rate": 0.0003243125879593286,
|
| 11505 |
+
"loss": 2.5441,
|
| 11506 |
+
"num_input_tokens_seen": 16934497856,
|
| 11507 |
+
"step": 64600
|
| 11508 |
+
},
|
| 11509 |
+
{
|
| 11510 |
+
"epoch": 0.3083821267157184,
|
| 11511 |
+
"grad_norm": 0.17826180160045624,
|
| 11512 |
+
"learning_rate": 0.000319071491151664,
|
| 11513 |
+
"loss": 2.545,
|
| 11514 |
+
"num_input_tokens_seen": 16947605056,
|
| 11515 |
+
"step": 64650
|
| 11516 |
+
},
|
| 11517 |
+
{
|
| 11518 |
+
"epoch": 0.30862062797381257,
|
| 11519 |
+
"grad_norm": 0.17889030277729034,
|
| 11520 |
+
"learning_rate": 0.00031385317079902743,
|
| 11521 |
+
"loss": 2.5405,
|
| 11522 |
+
"num_input_tokens_seen": 16960712256,
|
| 11523 |
+
"step": 64700
|
| 11524 |
+
},
|
| 11525 |
+
{
|
| 11526 |
+
"epoch": 0.30885912923190667,
|
| 11527 |
+
"grad_norm": 0.1711336225271225,
|
| 11528 |
+
"learning_rate": 0.0003086582838174551,
|
| 11529 |
+
"loss": 2.5222,
|
| 11530 |
+
"num_input_tokens_seen": 16973819456,
|
| 11531 |
+
"step": 64750
|
| 11532 |
+
},
|
| 11533 |
+
{
|
| 11534 |
+
"epoch": 0.30909763049000083,
|
| 11535 |
+
"grad_norm": 0.17962214350700378,
|
| 11536 |
+
"learning_rate": 0.0003034874841730382,
|
| 11537 |
+
"loss": 2.5376,
|
| 11538 |
+
"num_input_tokens_seen": 16986926656,
|
| 11539 |
+
"step": 64800
|
| 11540 |
+
},
|
| 11541 |
+
{
|
| 11542 |
+
"epoch": 0.309336131748095,
|
| 11543 |
+
"grad_norm": 0.1699627935886383,
|
| 11544 |
+
"learning_rate": 0.0002983414227995975,
|
| 11545 |
+
"loss": 2.5616,
|
| 11546 |
+
"num_input_tokens_seen": 17000033856,
|
| 11547 |
+
"step": 64850
|
| 11548 |
+
},
|
| 11549 |
+
{
|
| 11550 |
+
"epoch": 0.3095746330061891,
|
| 11551 |
+
"grad_norm": 0.18442535400390625,
|
| 11552 |
+
"learning_rate": 0.00029322074751673977,
|
| 11553 |
+
"loss": 2.5377,
|
| 11554 |
+
"num_input_tokens_seen": 17013141056,
|
| 11555 |
+
"step": 64900
|
| 11556 |
+
},
|
| 11557 |
+
{
|
| 11558 |
+
"epoch": 0.30981313426428325,
|
| 11559 |
+
"grad_norm": 0.17972196638584137,
|
| 11560 |
+
"learning_rate": 0.0002881261029483057,
|
| 11561 |
+
"loss": 2.5474,
|
| 11562 |
+
"num_input_tokens_seen": 17026248256,
|
| 11563 |
+
"step": 64950
|
| 11564 |
+
},
|
| 11565 |
+
{
|
| 11566 |
+
"epoch": 0.31005163552237736,
|
| 11567 |
+
"grad_norm": 0.1810217946767807,
|
| 11568 |
+
"learning_rate": 0.00028305813044122096,
|
| 11569 |
+
"loss": 2.5286,
|
| 11570 |
+
"num_input_tokens_seen": 17039355456,
|
| 11571 |
+
"step": 65000
|
| 11572 |
+
},
|
| 11573 |
+
{
|
| 11574 |
+
"epoch": 0.31005163552237736,
|
| 11575 |
+
"eval_loss": 2.4292306900024414,
|
| 11576 |
+
"eval_runtime": 53.3956,
|
| 11577 |
+
"eval_samples_per_second": 93.641,
|
| 11578 |
+
"eval_steps_per_second": 23.41,
|
| 11579 |
+
"num_input_tokens_seen": 17039355456,
|
| 11580 |
+
"step": 65000
|
| 11581 |
}
|
| 11582 |
],
|
| 11583 |
"logging_steps": 50,
|
| 11584 |
"max_steps": 70000,
|
| 11585 |
+
"num_input_tokens_seen": 17039355456,
|
| 11586 |
"num_train_epochs": 1,
|
| 11587 |
"save_steps": 1000,
|
| 11588 |
"stateful_callbacks": {
|
|
|
|
| 11597 |
"attributes": {}
|
| 11598 |
}
|
| 11599 |
},
|
| 11600 |
+
"total_flos": 4.5581938885892506e+18,
|
| 11601 |
"train_batch_size": 64,
|
| 11602 |
"trial_name": null,
|
| 11603 |
"trial_params": null
|