Training in progress, step 133000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:92ad31cc8051a774ff84bf50a2f043b12568d60c659ab713450ad489e60ff067
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5f98fbf6f84fc645d4e9351e4872ab3409232339169c895981f2ca6168553f54
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eda9968c0f9e110957e79edd3603196e5c46bdd8acc1a9a916fa49100e905254
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c40f5e3cc10bc35190c452a89f96d672b73ffd5edfe6d4e72f9d0b88f5a7c9a
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -23504,11 +23504,189 @@
|
|
| 23504 |
"eval_steps_per_second": 15.132,
|
| 23505 |
"num_input_tokens_seen": 69194840608,
|
| 23506 |
"step": 132000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23507 |
}
|
| 23508 |
],
|
| 23509 |
"logging_steps": 50,
|
| 23510 |
"max_steps": 140000,
|
| 23511 |
-
"num_input_tokens_seen":
|
| 23512 |
"num_train_epochs": 2,
|
| 23513 |
"save_steps": 1000,
|
| 23514 |
"stateful_callbacks": {
|
|
@@ -23523,7 +23701,7 @@
|
|
| 23523 |
"attributes": {}
|
| 23524 |
}
|
| 23525 |
},
|
| 23526 |
-
"total_flos": 1.
|
| 23527 |
"train_batch_size": 32,
|
| 23528 |
"trial_name": null,
|
| 23529 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.2688290780733869,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 133000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 23504 |
"eval_steps_per_second": 15.132,
|
| 23505 |
"num_input_tokens_seen": 69194840608,
|
| 23506 |
"step": 132000
|
| 23507 |
+
},
|
| 23508 |
+
{
|
| 23509 |
+
"epoch": 1.2597660302658096,
|
| 23510 |
+
"grad_norm": 0.11991748213768005,
|
| 23511 |
+
"learning_rate": 0.0001860669839912626,
|
| 23512 |
+
"loss": 2.0354,
|
| 23513 |
+
"num_input_tokens_seen": 69221050496,
|
| 23514 |
+
"step": 132050
|
| 23515 |
+
},
|
| 23516 |
+
{
|
| 23517 |
+
"epoch": 1.260243032781998,
|
| 23518 |
+
"grad_norm": 0.11859247088432312,
|
| 23519 |
+
"learning_rate": 0.00018388874897104518,
|
| 23520 |
+
"loss": 2.0449,
|
| 23521 |
+
"num_input_tokens_seen": 69247257536,
|
| 23522 |
+
"step": 132100
|
| 23523 |
+
},
|
| 23524 |
+
{
|
| 23525 |
+
"epoch": 1.2607200352981862,
|
| 23526 |
+
"grad_norm": 0.12269642949104309,
|
| 23527 |
+
"learning_rate": 0.00018172046256311088,
|
| 23528 |
+
"loss": 2.0427,
|
| 23529 |
+
"num_input_tokens_seen": 69273469824,
|
| 23530 |
+
"step": 132150
|
| 23531 |
+
},
|
| 23532 |
+
{
|
| 23533 |
+
"epoch": 1.2611970378143744,
|
| 23534 |
+
"grad_norm": 0.11893275380134583,
|
| 23535 |
+
"learning_rate": 0.00017956219300748795,
|
| 23536 |
+
"loss": 2.0366,
|
| 23537 |
+
"num_input_tokens_seen": 69299684224,
|
| 23538 |
+
"step": 132200
|
| 23539 |
+
},
|
| 23540 |
+
{
|
| 23541 |
+
"epoch": 1.2616740403305626,
|
| 23542 |
+
"grad_norm": 0.12191104143857956,
|
| 23543 |
+
"learning_rate": 0.0001774140082289563,
|
| 23544 |
+
"loss": 2.0393,
|
| 23545 |
+
"num_input_tokens_seen": 69325894496,
|
| 23546 |
+
"step": 132250
|
| 23547 |
+
},
|
| 23548 |
+
{
|
| 23549 |
+
"epoch": 1.262151042846751,
|
| 23550 |
+
"grad_norm": 0.12704069912433624,
|
| 23551 |
+
"learning_rate": 0.00017527597583490823,
|
| 23552 |
+
"loss": 2.0551,
|
| 23553 |
+
"num_input_tokens_seen": 69352101952,
|
| 23554 |
+
"step": 132300
|
| 23555 |
+
},
|
| 23556 |
+
{
|
| 23557 |
+
"epoch": 1.2626280453629393,
|
| 23558 |
+
"grad_norm": 0.12682849168777466,
|
| 23559 |
+
"learning_rate": 0.00017314816311322218,
|
| 23560 |
+
"loss": 2.0376,
|
| 23561 |
+
"num_input_tokens_seen": 69378314752,
|
| 23562 |
+
"step": 132350
|
| 23563 |
+
},
|
| 23564 |
+
{
|
| 23565 |
+
"epoch": 1.2631050478791277,
|
| 23566 |
+
"grad_norm": 0.1246429830789566,
|
| 23567 |
+
"learning_rate": 0.00017103063703014372,
|
| 23568 |
+
"loss": 2.0402,
|
| 23569 |
+
"num_input_tokens_seen": 69404523776,
|
| 23570 |
+
"step": 132400
|
| 23571 |
+
},
|
| 23572 |
+
{
|
| 23573 |
+
"epoch": 1.263582050395316,
|
| 23574 |
+
"grad_norm": 0.12006555497646332,
|
| 23575 |
+
"learning_rate": 0.00016892346422817944,
|
| 23576 |
+
"loss": 2.0383,
|
| 23577 |
+
"num_input_tokens_seen": 69430732160,
|
| 23578 |
+
"step": 132450
|
| 23579 |
+
},
|
| 23580 |
+
{
|
| 23581 |
+
"epoch": 1.264059052911504,
|
| 23582 |
+
"grad_norm": 0.12435656785964966,
|
| 23583 |
+
"learning_rate": 0.00016682671102399805,
|
| 23584 |
+
"loss": 2.0347,
|
| 23585 |
+
"num_input_tokens_seen": 69456943424,
|
| 23586 |
+
"step": 132500
|
| 23587 |
+
},
|
| 23588 |
+
{
|
| 23589 |
+
"epoch": 1.264059052911504,
|
| 23590 |
+
"eval_loss": 1.9590063095092773,
|
| 23591 |
+
"eval_runtime": 82.7888,
|
| 23592 |
+
"eval_samples_per_second": 60.395,
|
| 23593 |
+
"eval_steps_per_second": 15.099,
|
| 23594 |
+
"num_input_tokens_seen": 69456943424,
|
| 23595 |
+
"step": 132500
|
| 23596 |
+
},
|
| 23597 |
+
{
|
| 23598 |
+
"epoch": 1.2645360554276923,
|
| 23599 |
+
"grad_norm": 0.12412598729133606,
|
| 23600 |
+
"learning_rate": 0.0001647404434063447,
|
| 23601 |
+
"loss": 2.0436,
|
| 23602 |
+
"num_input_tokens_seen": 69483146688,
|
| 23603 |
+
"step": 132550
|
| 23604 |
+
},
|
| 23605 |
+
{
|
| 23606 |
+
"epoch": 1.2650130579438805,
|
| 23607 |
+
"grad_norm": 0.12309623509645462,
|
| 23608 |
+
"learning_rate": 0.00016266472703396284,
|
| 23609 |
+
"loss": 2.028,
|
| 23610 |
+
"num_input_tokens_seen": 69509359968,
|
| 23611 |
+
"step": 132600
|
| 23612 |
+
},
|
| 23613 |
+
{
|
| 23614 |
+
"epoch": 1.265490060460069,
|
| 23615 |
+
"grad_norm": 0.12758532166481018,
|
| 23616 |
+
"learning_rate": 0.0001605996272335291,
|
| 23617 |
+
"loss": 2.041,
|
| 23618 |
+
"num_input_tokens_seen": 69535568960,
|
| 23619 |
+
"step": 132650
|
| 23620 |
+
},
|
| 23621 |
+
{
|
| 23622 |
+
"epoch": 1.2659670629762572,
|
| 23623 |
+
"grad_norm": 0.11922606080770493,
|
| 23624 |
+
"learning_rate": 0.00015854520899759655,
|
| 23625 |
+
"loss": 2.0308,
|
| 23626 |
+
"num_input_tokens_seen": 69561777024,
|
| 23627 |
+
"step": 132700
|
| 23628 |
+
},
|
| 23629 |
+
{
|
| 23630 |
+
"epoch": 1.2664440654924456,
|
| 23631 |
+
"grad_norm": 0.1239946112036705,
|
| 23632 |
+
"learning_rate": 0.00015650153698254916,
|
| 23633 |
+
"loss": 2.0336,
|
| 23634 |
+
"num_input_tokens_seen": 69587981952,
|
| 23635 |
+
"step": 132750
|
| 23636 |
+
},
|
| 23637 |
+
{
|
| 23638 |
+
"epoch": 1.2669210680086338,
|
| 23639 |
+
"grad_norm": 0.12584541738033295,
|
| 23640 |
+
"learning_rate": 0.00015446867550656767,
|
| 23641 |
+
"loss": 2.0376,
|
| 23642 |
+
"num_input_tokens_seen": 69614192832,
|
| 23643 |
+
"step": 132800
|
| 23644 |
+
},
|
| 23645 |
+
{
|
| 23646 |
+
"epoch": 1.267398070524822,
|
| 23647 |
+
"grad_norm": 0.12514598667621613,
|
| 23648 |
+
"learning_rate": 0.00015244668854760458,
|
| 23649 |
+
"loss": 2.0411,
|
| 23650 |
+
"num_input_tokens_seen": 69640405600,
|
| 23651 |
+
"step": 132850
|
| 23652 |
+
},
|
| 23653 |
+
{
|
| 23654 |
+
"epoch": 1.2678750730410102,
|
| 23655 |
+
"grad_norm": 0.12181352823972702,
|
| 23656 |
+
"learning_rate": 0.00015043563974137132,
|
| 23657 |
+
"loss": 2.0404,
|
| 23658 |
+
"num_input_tokens_seen": 69666619040,
|
| 23659 |
+
"step": 132900
|
| 23660 |
+
},
|
| 23661 |
+
{
|
| 23662 |
+
"epoch": 1.2683520755571986,
|
| 23663 |
+
"grad_norm": 0.11871461570262909,
|
| 23664 |
+
"learning_rate": 0.00014843559237933475,
|
| 23665 |
+
"loss": 2.0458,
|
| 23666 |
+
"num_input_tokens_seen": 69692833440,
|
| 23667 |
+
"step": 132950
|
| 23668 |
+
},
|
| 23669 |
+
{
|
| 23670 |
+
"epoch": 1.2688290780733869,
|
| 23671 |
+
"grad_norm": 0.12271245568990707,
|
| 23672 |
+
"learning_rate": 0.00014644660940672628,
|
| 23673 |
+
"loss": 2.0354,
|
| 23674 |
+
"num_input_tokens_seen": 69719047840,
|
| 23675 |
+
"step": 133000
|
| 23676 |
+
},
|
| 23677 |
+
{
|
| 23678 |
+
"epoch": 1.2688290780733869,
|
| 23679 |
+
"eval_loss": 1.9576880931854248,
|
| 23680 |
+
"eval_runtime": 82.558,
|
| 23681 |
+
"eval_samples_per_second": 60.564,
|
| 23682 |
+
"eval_steps_per_second": 15.141,
|
| 23683 |
+
"num_input_tokens_seen": 69719047840,
|
| 23684 |
+
"step": 133000
|
| 23685 |
}
|
| 23686 |
],
|
| 23687 |
"logging_steps": 50,
|
| 23688 |
"max_steps": 140000,
|
| 23689 |
+
"num_input_tokens_seen": 69719047840,
|
| 23690 |
"num_train_epochs": 2,
|
| 23691 |
"save_steps": 1000,
|
| 23692 |
"stateful_callbacks": {
|
|
|
|
| 23701 |
"attributes": {}
|
| 23702 |
}
|
| 23703 |
},
|
| 23704 |
+
"total_flos": 1.2338999792247398e+20,
|
| 23705 |
"train_batch_size": 32,
|
| 23706 |
"trial_name": null,
|
| 23707 |
"trial_params": null
|