Training in progress, step 60000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c3d4eb10327c6f996a0988361f6ad9bbab09e394aba34b1a396d7082da2216c0
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f1da98e221b67155367bda2e5baaef41263bc46b4743e333b4e678859da5c6df
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e6a4cb233f004dcf5c1bd7310c625e6acfeb53e49f5aa9a513759dc7631fff0b
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be823a58640077d89dc450d2caf77b9f9c93851d1d9a6e787b2d5f1c9c9930be
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -10510,11 +10510,189 @@
|
|
| 10510 |
"eval_steps_per_second": 23.434,
|
| 10511 |
"num_input_tokens_seen": 15466491456,
|
| 10512 |
"step": 59000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10513 |
}
|
| 10514 |
],
|
| 10515 |
"logging_steps": 50,
|
| 10516 |
"max_steps": 70000,
|
| 10517 |
-
"num_input_tokens_seen":
|
| 10518 |
"num_train_epochs": 1,
|
| 10519 |
"save_steps": 1000,
|
| 10520 |
"stateful_callbacks": {
|
|
@@ -10529,7 +10707,7 @@
|
|
| 10529 |
"attributes": {}
|
| 10530 |
}
|
| 10531 |
},
|
| 10532 |
-
"total_flos": 4.
|
| 10533 |
"train_batch_size": 64,
|
| 10534 |
"trial_name": null,
|
| 10535 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.2862015097129637,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 60000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 10510 |
"eval_steps_per_second": 23.434,
|
| 10511 |
"num_input_tokens_seen": 15466491456,
|
| 10512 |
"step": 59000
|
| 10513 |
+
},
|
| 10514 |
+
{
|
| 10515 |
+
"epoch": 0.2816699858091751,
|
| 10516 |
+
"grad_norm": 0.3338637351989746,
|
| 10517 |
+
"learning_rate": 0.0008873934395068005,
|
| 10518 |
+
"loss": 2.587,
|
| 10519 |
+
"num_input_tokens_seen": 15479598656,
|
| 10520 |
+
"step": 59050
|
| 10521 |
+
},
|
| 10522 |
+
{
|
| 10523 |
+
"epoch": 0.2819084870672693,
|
| 10524 |
+
"grad_norm": 0.20848780870437622,
|
| 10525 |
+
"learning_rate": 0.0008838223701790055,
|
| 10526 |
+
"loss": 2.5989,
|
| 10527 |
+
"num_input_tokens_seen": 15492705856,
|
| 10528 |
+
"step": 59100
|
| 10529 |
+
},
|
| 10530 |
+
{
|
| 10531 |
+
"epoch": 0.2821469883253634,
|
| 10532 |
+
"grad_norm": 0.21479378640651703,
|
| 10533 |
+
"learning_rate": 0.0008802029828000156,
|
| 10534 |
+
"loss": 2.6052,
|
| 10535 |
+
"num_input_tokens_seen": 15505813056,
|
| 10536 |
+
"step": 59150
|
| 10537 |
+
},
|
| 10538 |
+
{
|
| 10539 |
+
"epoch": 0.28238548958345755,
|
| 10540 |
+
"grad_norm": 0.1944151073694229,
|
| 10541 |
+
"learning_rate": 0.0008765357330018055,
|
| 10542 |
+
"loss": 2.6044,
|
| 10543 |
+
"num_input_tokens_seen": 15518920256,
|
| 10544 |
+
"step": 59200
|
| 10545 |
+
},
|
| 10546 |
+
{
|
| 10547 |
+
"epoch": 0.2826239908415517,
|
| 10548 |
+
"grad_norm": 0.2078033685684204,
|
| 10549 |
+
"learning_rate": 0.0008728210824415827,
|
| 10550 |
+
"loss": 2.5929,
|
| 10551 |
+
"num_input_tokens_seen": 15532027456,
|
| 10552 |
+
"step": 59250
|
| 10553 |
+
},
|
| 10554 |
+
{
|
| 10555 |
+
"epoch": 0.2828624920996458,
|
| 10556 |
+
"grad_norm": 0.19340284168720245,
|
| 10557 |
+
"learning_rate": 0.0008690594987436704,
|
| 10558 |
+
"loss": 2.5875,
|
| 10559 |
+
"num_input_tokens_seen": 15545134656,
|
| 10560 |
+
"step": 59300
|
| 10561 |
+
},
|
| 10562 |
+
{
|
| 10563 |
+
"epoch": 0.28310099335773997,
|
| 10564 |
+
"grad_norm": 0.22354012727737427,
|
| 10565 |
+
"learning_rate": 0.0008652514554406388,
|
| 10566 |
+
"loss": 2.5976,
|
| 10567 |
+
"num_input_tokens_seen": 15558241856,
|
| 10568 |
+
"step": 59350
|
| 10569 |
+
},
|
| 10570 |
+
{
|
| 10571 |
+
"epoch": 0.2833394946158341,
|
| 10572 |
+
"grad_norm": 0.26784005761146545,
|
| 10573 |
+
"learning_rate": 0.0008613974319136957,
|
| 10574 |
+
"loss": 2.5868,
|
| 10575 |
+
"num_input_tokens_seen": 15571349056,
|
| 10576 |
+
"step": 59400
|
| 10577 |
+
},
|
| 10578 |
+
{
|
| 10579 |
+
"epoch": 0.28357799587392823,
|
| 10580 |
+
"grad_norm": 0.20749828219413757,
|
| 10581 |
+
"learning_rate": 0.0008574979133323377,
|
| 10582 |
+
"loss": 2.5784,
|
| 10583 |
+
"num_input_tokens_seen": 15584456256,
|
| 10584 |
+
"step": 59450
|
| 10585 |
+
},
|
| 10586 |
+
{
|
| 10587 |
+
"epoch": 0.2838164971320224,
|
| 10588 |
+
"grad_norm": 0.21545729041099548,
|
| 10589 |
+
"learning_rate": 0.0008535533905932737,
|
| 10590 |
+
"loss": 2.5939,
|
| 10591 |
+
"num_input_tokens_seen": 15597563456,
|
| 10592 |
+
"step": 59500
|
| 10593 |
+
},
|
| 10594 |
+
{
|
| 10595 |
+
"epoch": 0.2838164971320224,
|
| 10596 |
+
"eval_loss": 2.469989538192749,
|
| 10597 |
+
"eval_runtime": 54.0784,
|
| 10598 |
+
"eval_samples_per_second": 92.458,
|
| 10599 |
+
"eval_steps_per_second": 23.115,
|
| 10600 |
+
"num_input_tokens_seen": 15597563456,
|
| 10601 |
+
"step": 59500
|
| 10602 |
+
},
|
| 10603 |
+
{
|
| 10604 |
+
"epoch": 0.2840549983901165,
|
| 10605 |
+
"grad_norm": 0.20836423337459564,
|
| 10606 |
+
"learning_rate": 0.0008495643602586287,
|
| 10607 |
+
"loss": 2.5858,
|
| 10608 |
+
"num_input_tokens_seen": 15610670656,
|
| 10609 |
+
"step": 59550
|
| 10610 |
+
},
|
| 10611 |
+
{
|
| 10612 |
+
"epoch": 0.28429349964821066,
|
| 10613 |
+
"grad_norm": 0.20427604019641876,
|
| 10614 |
+
"learning_rate": 0.0008455313244934324,
|
| 10615 |
+
"loss": 2.5781,
|
| 10616 |
+
"num_input_tokens_seen": 15623777856,
|
| 10617 |
+
"step": 59600
|
| 10618 |
+
},
|
| 10619 |
+
{
|
| 10620 |
+
"epoch": 0.28453200090630476,
|
| 10621 |
+
"grad_norm": 0.2341683804988861,
|
| 10622 |
+
"learning_rate": 0.0008414547910024035,
|
| 10623 |
+
"loss": 2.5713,
|
| 10624 |
+
"num_input_tokens_seen": 15636885056,
|
| 10625 |
+
"step": 59650
|
| 10626 |
+
},
|
| 10627 |
+
{
|
| 10628 |
+
"epoch": 0.2847705021643989,
|
| 10629 |
+
"grad_norm": 0.20808522403240204,
|
| 10630 |
+
"learning_rate": 0.0008373352729660373,
|
| 10631 |
+
"loss": 2.5751,
|
| 10632 |
+
"num_input_tokens_seen": 15649992256,
|
| 10633 |
+
"step": 59700
|
| 10634 |
+
},
|
| 10635 |
+
{
|
| 10636 |
+
"epoch": 0.2850090034224931,
|
| 10637 |
+
"grad_norm": 0.21032562851905823,
|
| 10638 |
+
"learning_rate": 0.000833173288976002,
|
| 10639 |
+
"loss": 2.5784,
|
| 10640 |
+
"num_input_tokens_seen": 15663099456,
|
| 10641 |
+
"step": 59750
|
| 10642 |
+
},
|
| 10643 |
+
{
|
| 10644 |
+
"epoch": 0.2852475046805872,
|
| 10645 |
+
"grad_norm": 0.23485584557056427,
|
| 10646 |
+
"learning_rate": 0.0008289693629698564,
|
| 10647 |
+
"loss": 2.5974,
|
| 10648 |
+
"num_input_tokens_seen": 15676206656,
|
| 10649 |
+
"step": 59800
|
| 10650 |
+
},
|
| 10651 |
+
{
|
| 10652 |
+
"epoch": 0.28548600593868134,
|
| 10653 |
+
"grad_norm": 0.2229880541563034,
|
| 10654 |
+
"learning_rate": 0.0008247240241650918,
|
| 10655 |
+
"loss": 2.5834,
|
| 10656 |
+
"num_input_tokens_seen": 15689313856,
|
| 10657 |
+
"step": 59850
|
| 10658 |
+
},
|
| 10659 |
+
{
|
| 10660 |
+
"epoch": 0.28572450719677545,
|
| 10661 |
+
"grad_norm": 0.21837118268013,
|
| 10662 |
+
"learning_rate": 0.000820437806992512,
|
| 10663 |
+
"loss": 2.5734,
|
| 10664 |
+
"num_input_tokens_seen": 15702421056,
|
| 10665 |
+
"step": 59900
|
| 10666 |
+
},
|
| 10667 |
+
{
|
| 10668 |
+
"epoch": 0.2859630084548696,
|
| 10669 |
+
"grad_norm": 0.2157929688692093,
|
| 10670 |
+
"learning_rate": 0.0008161112510289549,
|
| 10671 |
+
"loss": 2.587,
|
| 10672 |
+
"num_input_tokens_seen": 15715528256,
|
| 10673 |
+
"step": 59950
|
| 10674 |
+
},
|
| 10675 |
+
{
|
| 10676 |
+
"epoch": 0.2862015097129637,
|
| 10677 |
+
"grad_norm": 0.24053893983364105,
|
| 10678 |
+
"learning_rate": 0.0008117449009293668,
|
| 10679 |
+
"loss": 2.5853,
|
| 10680 |
+
"num_input_tokens_seen": 15728635456,
|
| 10681 |
+
"step": 60000
|
| 10682 |
+
},
|
| 10683 |
+
{
|
| 10684 |
+
"epoch": 0.2862015097129637,
|
| 10685 |
+
"eval_loss": 2.470459461212158,
|
| 10686 |
+
"eval_runtime": 53.5859,
|
| 10687 |
+
"eval_samples_per_second": 93.308,
|
| 10688 |
+
"eval_steps_per_second": 23.327,
|
| 10689 |
+
"num_input_tokens_seen": 15728635456,
|
| 10690 |
+
"step": 60000
|
| 10691 |
}
|
| 10692 |
],
|
| 10693 |
"logging_steps": 50,
|
| 10694 |
"max_steps": 70000,
|
| 10695 |
+
"num_input_tokens_seen": 15728635456,
|
| 10696 |
"num_train_epochs": 1,
|
| 10697 |
"save_steps": 1000,
|
| 10698 |
"stateful_callbacks": {
|
|
|
|
| 10707 |
"attributes": {}
|
| 10708 |
}
|
| 10709 |
},
|
| 10710 |
+
"total_flos": 4.2075634959620506e+18,
|
| 10711 |
"train_batch_size": 64,
|
| 10712 |
"trial_name": null,
|
| 10713 |
"trial_params": null
|