Training in progress, step 55000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:62e4ec5f596aeddac39f75a6501f66ecd7eb297d85fd39f281237c384adec887
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6aa82a32a09e79af011cf35188194304359148308b76399c6d5815593f337709
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9a5eacfa99e53a8a1de73851121ef39f03223e9cc67398ac06a0e84e6dbf4ae3
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5df6e1f8ed049732a2e5d49c46b32207c644d0cb43e6b3e615ea32a67128cbab
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -9620,11 +9620,189 @@
|
|
| 9620 |
"eval_steps_per_second": 23.494,
|
| 9621 |
"num_input_tokens_seen": 14155776000,
|
| 9622 |
"step": 54000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9623 |
}
|
| 9624 |
],
|
| 9625 |
"logging_steps": 50,
|
| 9626 |
"max_steps": 60000,
|
| 9627 |
-
"num_input_tokens_seen":
|
| 9628 |
"num_train_epochs": 1,
|
| 9629 |
"save_steps": 1000,
|
| 9630 |
"stateful_callbacks": {
|
|
@@ -9639,7 +9817,7 @@
|
|
| 9639 |
"attributes": {}
|
| 9640 |
}
|
| 9641 |
},
|
| 9642 |
-
"total_flos": 3.
|
| 9643 |
"train_batch_size": 64,
|
| 9644 |
"trial_name": null,
|
| 9645 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.3699574380783738,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 55000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 9620 |
"eval_steps_per_second": 23.494,
|
| 9621 |
"num_input_tokens_seen": 14155776000,
|
| 9622 |
"step": 54000
|
| 9623 |
+
},
|
| 9624 |
+
{
|
| 9625 |
+
"epoch": 0.36356726414792917,
|
| 9626 |
+
"grad_norm": 0.2197147160768509,
|
| 9627 |
+
"learning_rate": 0.0009998286624877785,
|
| 9628 |
+
"loss": 3.0502,
|
| 9629 |
+
"num_input_tokens_seen": 14168883200,
|
| 9630 |
+
"step": 54050
|
| 9631 |
+
},
|
| 9632 |
+
{
|
| 9633 |
+
"epoch": 0.3639035890916368,
|
| 9634 |
+
"grad_norm": 0.22259306907653809,
|
| 9635 |
+
"learning_rate": 0.0009993147673772868,
|
| 9636 |
+
"loss": 3.0433,
|
| 9637 |
+
"num_input_tokens_seen": 14181990400,
|
| 9638 |
+
"step": 54100
|
| 9639 |
+
},
|
| 9640 |
+
{
|
| 9641 |
+
"epoch": 0.3642399140353444,
|
| 9642 |
+
"grad_norm": 0.19341766834259033,
|
| 9643 |
+
"learning_rate": 0.000998458666866564,
|
| 9644 |
+
"loss": 3.0486,
|
| 9645 |
+
"num_input_tokens_seen": 14195097600,
|
| 9646 |
+
"step": 54150
|
| 9647 |
+
},
|
| 9648 |
+
{
|
| 9649 |
+
"epoch": 0.364576238979052,
|
| 9650 |
+
"grad_norm": 0.2313617616891861,
|
| 9651 |
+
"learning_rate": 0.0009972609476841367,
|
| 9652 |
+
"loss": 3.0446,
|
| 9653 |
+
"num_input_tokens_seen": 14208204800,
|
| 9654 |
+
"step": 54200
|
| 9655 |
+
},
|
| 9656 |
+
{
|
| 9657 |
+
"epoch": 0.3649125639227596,
|
| 9658 |
+
"grad_norm": 0.1925128698348999,
|
| 9659 |
+
"learning_rate": 0.0009957224306869053,
|
| 9660 |
+
"loss": 3.0528,
|
| 9661 |
+
"num_input_tokens_seen": 14221312000,
|
| 9662 |
+
"step": 54250
|
| 9663 |
+
},
|
| 9664 |
+
{
|
| 9665 |
+
"epoch": 0.3652488888664672,
|
| 9666 |
+
"grad_norm": 0.2100643515586853,
|
| 9667 |
+
"learning_rate": 0.0009938441702975688,
|
| 9668 |
+
"loss": 3.0453,
|
| 9669 |
+
"num_input_tokens_seen": 14234419200,
|
| 9670 |
+
"step": 54300
|
| 9671 |
+
},
|
| 9672 |
+
{
|
| 9673 |
+
"epoch": 0.36558521381017484,
|
| 9674 |
+
"grad_norm": 0.46658360958099365,
|
| 9675 |
+
"learning_rate": 0.0009916274537819774,
|
| 9676 |
+
"loss": 3.0464,
|
| 9677 |
+
"num_input_tokens_seen": 14247526400,
|
| 9678 |
+
"step": 54350
|
| 9679 |
+
},
|
| 9680 |
+
{
|
| 9681 |
+
"epoch": 0.36592153875388245,
|
| 9682 |
+
"grad_norm": 0.19623732566833496,
|
| 9683 |
+
"learning_rate": 0.0009890738003669028,
|
| 9684 |
+
"loss": 3.0427,
|
| 9685 |
+
"num_input_tokens_seen": 14260633600,
|
| 9686 |
+
"step": 54400
|
| 9687 |
+
},
|
| 9688 |
+
{
|
| 9689 |
+
"epoch": 0.36625786369759006,
|
| 9690 |
+
"grad_norm": 0.24941138923168182,
|
| 9691 |
+
"learning_rate": 0.0009861849601988384,
|
| 9692 |
+
"loss": 3.0528,
|
| 9693 |
+
"num_input_tokens_seen": 14273740800,
|
| 9694 |
+
"step": 54450
|
| 9695 |
+
},
|
| 9696 |
+
{
|
| 9697 |
+
"epoch": 0.36659418864129767,
|
| 9698 |
+
"grad_norm": 0.22141198813915253,
|
| 9699 |
+
"learning_rate": 0.0009829629131445341,
|
| 9700 |
+
"loss": 3.0523,
|
| 9701 |
+
"num_input_tokens_seen": 14286848000,
|
| 9702 |
+
"step": 54500
|
| 9703 |
+
},
|
| 9704 |
+
{
|
| 9705 |
+
"epoch": 0.36659418864129767,
|
| 9706 |
+
"eval_loss": 2.9419288635253906,
|
| 9707 |
+
"eval_runtime": 53.6937,
|
| 9708 |
+
"eval_samples_per_second": 93.121,
|
| 9709 |
+
"eval_steps_per_second": 23.28,
|
| 9710 |
+
"num_input_tokens_seen": 14286848000,
|
| 9711 |
+
"step": 54500
|
| 9712 |
+
},
|
| 9713 |
+
{
|
| 9714 |
+
"epoch": 0.3669305135850053,
|
| 9715 |
+
"grad_norm": 0.2028401494026184,
|
| 9716 |
+
"learning_rate": 0.0009794098674340967,
|
| 9717 |
+
"loss": 3.0403,
|
| 9718 |
+
"num_input_tokens_seen": 14299955200,
|
| 9719 |
+
"step": 54550
|
| 9720 |
+
},
|
| 9721 |
+
{
|
| 9722 |
+
"epoch": 0.3672668385287129,
|
| 9723 |
+
"grad_norm": 0.20509253442287445,
|
| 9724 |
+
"learning_rate": 0.0009755282581475768,
|
| 9725 |
+
"loss": 3.0543,
|
| 9726 |
+
"num_input_tokens_seen": 14313062400,
|
| 9727 |
+
"step": 54600
|
| 9728 |
+
},
|
| 9729 |
+
{
|
| 9730 |
+
"epoch": 0.3676031634724205,
|
| 9731 |
+
"grad_norm": 1.2793521881103516,
|
| 9732 |
+
"learning_rate": 0.0009713207455460893,
|
| 9733 |
+
"loss": 3.0718,
|
| 9734 |
+
"num_input_tokens_seen": 14326169600,
|
| 9735 |
+
"step": 54650
|
| 9736 |
+
},
|
| 9737 |
+
{
|
| 9738 |
+
"epoch": 0.3679394884161281,
|
| 9739 |
+
"grad_norm": 1.1210218667984009,
|
| 9740 |
+
"learning_rate": 0.0009667902132486009,
|
| 9741 |
+
"loss": 3.0706,
|
| 9742 |
+
"num_input_tokens_seen": 14339276800,
|
| 9743 |
+
"step": 54700
|
| 9744 |
+
},
|
| 9745 |
+
{
|
| 9746 |
+
"epoch": 0.3682758133598357,
|
| 9747 |
+
"grad_norm": 0.5492864847183228,
|
| 9748 |
+
"learning_rate": 0.0009619397662556434,
|
| 9749 |
+
"loss": 3.0793,
|
| 9750 |
+
"num_input_tokens_seen": 14352384000,
|
| 9751 |
+
"step": 54750
|
| 9752 |
+
},
|
| 9753 |
+
{
|
| 9754 |
+
"epoch": 0.36861213830354334,
|
| 9755 |
+
"grad_norm": 0.34732338786125183,
|
| 9756 |
+
"learning_rate": 0.0009567727288213005,
|
| 9757 |
+
"loss": 3.0662,
|
| 9758 |
+
"num_input_tokens_seen": 14365491200,
|
| 9759 |
+
"step": 54800
|
| 9760 |
+
},
|
| 9761 |
+
{
|
| 9762 |
+
"epoch": 0.36894846324725095,
|
| 9763 |
+
"grad_norm": 0.2698073983192444,
|
| 9764 |
+
"learning_rate": 0.0009512926421749304,
|
| 9765 |
+
"loss": 3.0682,
|
| 9766 |
+
"num_input_tokens_seen": 14378598400,
|
| 9767 |
+
"step": 54850
|
| 9768 |
+
},
|
| 9769 |
+
{
|
| 9770 |
+
"epoch": 0.36928478819095856,
|
| 9771 |
+
"grad_norm": 0.593543529510498,
|
| 9772 |
+
"learning_rate": 0.0009455032620941839,
|
| 9773 |
+
"loss": 3.0507,
|
| 9774 |
+
"num_input_tokens_seen": 14391705600,
|
| 9775 |
+
"step": 54900
|
| 9776 |
+
},
|
| 9777 |
+
{
|
| 9778 |
+
"epoch": 0.36962111313466617,
|
| 9779 |
+
"grad_norm": 0.28389155864715576,
|
| 9780 |
+
"learning_rate": 0.0009394085563309827,
|
| 9781 |
+
"loss": 3.0593,
|
| 9782 |
+
"num_input_tokens_seen": 14404812800,
|
| 9783 |
+
"step": 54950
|
| 9784 |
+
},
|
| 9785 |
+
{
|
| 9786 |
+
"epoch": 0.3699574380783738,
|
| 9787 |
+
"grad_norm": 0.2569947838783264,
|
| 9788 |
+
"learning_rate": 0.0009330127018922195,
|
| 9789 |
+
"loss": 3.0524,
|
| 9790 |
+
"num_input_tokens_seen": 14417920000,
|
| 9791 |
+
"step": 55000
|
| 9792 |
+
},
|
| 9793 |
+
{
|
| 9794 |
+
"epoch": 0.3699574380783738,
|
| 9795 |
+
"eval_loss": 2.9468750953674316,
|
| 9796 |
+
"eval_runtime": 52.9661,
|
| 9797 |
+
"eval_samples_per_second": 94.4,
|
| 9798 |
+
"eval_steps_per_second": 23.6,
|
| 9799 |
+
"num_input_tokens_seen": 14417920000,
|
| 9800 |
+
"step": 55000
|
| 9801 |
}
|
| 9802 |
],
|
| 9803 |
"logging_steps": 50,
|
| 9804 |
"max_steps": 60000,
|
| 9805 |
+
"num_input_tokens_seen": 14417920000,
|
| 9806 |
"num_train_epochs": 1,
|
| 9807 |
"save_steps": 1000,
|
| 9808 |
"stateful_callbacks": {
|
|
|
|
| 9817 |
"attributes": {}
|
| 9818 |
}
|
| 9819 |
},
|
| 9820 |
+
"total_flos": 3.8569343188992e+18,
|
| 9821 |
"train_batch_size": 64,
|
| 9822 |
"trial_name": null,
|
| 9823 |
"trial_params": null
|