Training in progress, step 61000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4dc1817a301fc24319ca1c05c92090e28d0ab00a3a5d43949da4772ff52fcf2b
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3c0576570955fbd0c77602fddc48b3da384f1445f3f7054045594138138a2617
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a8ee7735caca4437694ef1fa1c7821cadab81eb5dba9c8318224d8baee7f9384
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88c7ed774bb0bea4c8451805c5254d2a8728348d14f02b8481173830b417e9b0
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -10688,11 +10688,189 @@
|
|
| 10688 |
"eval_steps_per_second": 23.327,
|
| 10689 |
"num_input_tokens_seen": 15728635456,
|
| 10690 |
"step": 60000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10691 |
}
|
| 10692 |
],
|
| 10693 |
"logging_steps": 50,
|
| 10694 |
"max_steps": 70000,
|
| 10695 |
-
"num_input_tokens_seen":
|
| 10696 |
"num_train_epochs": 1,
|
| 10697 |
"save_steps": 1000,
|
| 10698 |
"stateful_callbacks": {
|
|
@@ -10707,7 +10885,7 @@
|
|
| 10707 |
"attributes": {}
|
| 10708 |
}
|
| 10709 |
},
|
| 10710 |
-
"total_flos": 4.
|
| 10711 |
"train_batch_size": 64,
|
| 10712 |
"trial_name": null,
|
| 10713 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.29097153487484645,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 61000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 10688 |
"eval_steps_per_second": 23.327,
|
| 10689 |
"num_input_tokens_seen": 15728635456,
|
| 10690 |
"step": 60000
|
| 10691 |
+
},
|
| 10692 |
+
{
|
| 10693 |
+
"epoch": 0.28644001097105787,
|
| 10694 |
+
"grad_norm": 0.25951045751571655,
|
| 10695 |
+
"learning_rate": 0.0008073393063582386,
|
| 10696 |
+
"loss": 2.5946,
|
| 10697 |
+
"num_input_tokens_seen": 15741742656,
|
| 10698 |
+
"step": 60050
|
| 10699 |
+
},
|
| 10700 |
+
{
|
| 10701 |
+
"epoch": 0.286678512229152,
|
| 10702 |
+
"grad_norm": 0.22712726891040802,
|
| 10703 |
+
"learning_rate": 0.00080289502192041,
|
| 10704 |
+
"loss": 2.5882,
|
| 10705 |
+
"num_input_tokens_seen": 15754849856,
|
| 10706 |
+
"step": 60100
|
| 10707 |
+
},
|
| 10708 |
+
{
|
| 10709 |
+
"epoch": 0.28691701348724613,
|
| 10710 |
+
"grad_norm": 0.2236946076154709,
|
| 10711 |
+
"learning_rate": 0.0007984126070912518,
|
| 10712 |
+
"loss": 2.5854,
|
| 10713 |
+
"num_input_tokens_seen": 15767957056,
|
| 10714 |
+
"step": 60150
|
| 10715 |
+
},
|
| 10716 |
+
{
|
| 10717 |
+
"epoch": 0.2871555147453403,
|
| 10718 |
+
"grad_norm": 0.3175867795944214,
|
| 10719 |
+
"learning_rate": 0.0007938926261462366,
|
| 10720 |
+
"loss": 2.5855,
|
| 10721 |
+
"num_input_tokens_seen": 15781064256,
|
| 10722 |
+
"step": 60200
|
| 10723 |
+
},
|
| 10724 |
+
{
|
| 10725 |
+
"epoch": 0.2873940160034344,
|
| 10726 |
+
"grad_norm": 0.22954128682613373,
|
| 10727 |
+
"learning_rate": 0.000789335648089903,
|
| 10728 |
+
"loss": 2.595,
|
| 10729 |
+
"num_input_tokens_seen": 15794171456,
|
| 10730 |
+
"step": 60250
|
| 10731 |
+
},
|
| 10732 |
+
{
|
| 10733 |
+
"epoch": 0.28763251726152855,
|
| 10734 |
+
"grad_norm": 0.23379147052764893,
|
| 10735 |
+
"learning_rate": 0.000784742246584226,
|
| 10736 |
+
"loss": 2.5872,
|
| 10737 |
+
"num_input_tokens_seen": 15807278656,
|
| 10738 |
+
"step": 60300
|
| 10739 |
+
},
|
| 10740 |
+
{
|
| 10741 |
+
"epoch": 0.2878710185196227,
|
| 10742 |
+
"grad_norm": 0.22107115387916565,
|
| 10743 |
+
"learning_rate": 0.0007801129998764014,
|
| 10744 |
+
"loss": 2.5704,
|
| 10745 |
+
"num_input_tokens_seen": 15820385856,
|
| 10746 |
+
"step": 60350
|
| 10747 |
+
},
|
| 10748 |
+
{
|
| 10749 |
+
"epoch": 0.2881095197777168,
|
| 10750 |
+
"grad_norm": 0.21197494864463806,
|
| 10751 |
+
"learning_rate": 0.0007754484907260512,
|
| 10752 |
+
"loss": 2.5751,
|
| 10753 |
+
"num_input_tokens_seen": 15833493056,
|
| 10754 |
+
"step": 60400
|
| 10755 |
+
},
|
| 10756 |
+
{
|
| 10757 |
+
"epoch": 0.288348021035811,
|
| 10758 |
+
"grad_norm": 0.21372662484645844,
|
| 10759 |
+
"learning_rate": 0.0007707493063318629,
|
| 10760 |
+
"loss": 2.5901,
|
| 10761 |
+
"num_input_tokens_seen": 15846600256,
|
| 10762 |
+
"step": 60450
|
| 10763 |
+
},
|
| 10764 |
+
{
|
| 10765 |
+
"epoch": 0.2885865222939051,
|
| 10766 |
+
"grad_norm": 0.23300603032112122,
|
| 10767 |
+
"learning_rate": 0.0007660160382576683,
|
| 10768 |
+
"loss": 2.5888,
|
| 10769 |
+
"num_input_tokens_seen": 15859707456,
|
| 10770 |
+
"step": 60500
|
| 10771 |
+
},
|
| 10772 |
+
{
|
| 10773 |
+
"epoch": 0.2885865222939051,
|
| 10774 |
+
"eval_loss": 2.463745355606079,
|
| 10775 |
+
"eval_runtime": 53.032,
|
| 10776 |
+
"eval_samples_per_second": 94.283,
|
| 10777 |
+
"eval_steps_per_second": 23.571,
|
| 10778 |
+
"num_input_tokens_seen": 15859707456,
|
| 10779 |
+
"step": 60500
|
| 10780 |
+
},
|
| 10781 |
+
{
|
| 10782 |
+
"epoch": 0.28882502355199924,
|
| 10783 |
+
"grad_norm": 0.2108684778213501,
|
| 10784 |
+
"learning_rate": 0.0007612492823579744,
|
| 10785 |
+
"loss": 2.5965,
|
| 10786 |
+
"num_input_tokens_seen": 15872814656,
|
| 10787 |
+
"step": 60550
|
| 10788 |
+
},
|
| 10789 |
+
{
|
| 10790 |
+
"epoch": 0.2890635248100934,
|
| 10791 |
+
"grad_norm": 0.20625820755958557,
|
| 10792 |
+
"learning_rate": 0.0007564496387029531,
|
| 10793 |
+
"loss": 2.5615,
|
| 10794 |
+
"num_input_tokens_seen": 15885921856,
|
| 10795 |
+
"step": 60600
|
| 10796 |
+
},
|
| 10797 |
+
{
|
| 10798 |
+
"epoch": 0.2893020260681875,
|
| 10799 |
+
"grad_norm": 0.22595694661140442,
|
| 10800 |
+
"learning_rate": 0.0007516177115029001,
|
| 10801 |
+
"loss": 2.5871,
|
| 10802 |
+
"num_input_tokens_seen": 15899029056,
|
| 10803 |
+
"step": 60650
|
| 10804 |
+
},
|
| 10805 |
+
{
|
| 10806 |
+
"epoch": 0.28954052732628166,
|
| 10807 |
+
"grad_norm": 0.2095574140548706,
|
| 10808 |
+
"learning_rate": 0.0007467541090321735,
|
| 10809 |
+
"loss": 2.5867,
|
| 10810 |
+
"num_input_tokens_seen": 15912136256,
|
| 10811 |
+
"step": 60700
|
| 10812 |
+
},
|
| 10813 |
+
{
|
| 10814 |
+
"epoch": 0.28977902858437576,
|
| 10815 |
+
"grad_norm": 0.1979990303516388,
|
| 10816 |
+
"learning_rate": 0.00074185944355262,
|
| 10817 |
+
"loss": 2.586,
|
| 10818 |
+
"num_input_tokens_seen": 15925243456,
|
| 10819 |
+
"step": 60750
|
| 10820 |
+
},
|
| 10821 |
+
{
|
| 10822 |
+
"epoch": 0.2900175298424699,
|
| 10823 |
+
"grad_norm": 0.3573000431060791,
|
| 10824 |
+
"learning_rate": 0.0007369343312364993,
|
| 10825 |
+
"loss": 2.5807,
|
| 10826 |
+
"num_input_tokens_seen": 15938350656,
|
| 10827 |
+
"step": 60800
|
| 10828 |
+
},
|
| 10829 |
+
{
|
| 10830 |
+
"epoch": 0.2902560311005641,
|
| 10831 |
+
"grad_norm": 0.2209523618221283,
|
| 10832 |
+
"learning_rate": 0.0007319793920889171,
|
| 10833 |
+
"loss": 2.5867,
|
| 10834 |
+
"num_input_tokens_seen": 15951457856,
|
| 10835 |
+
"step": 60850
|
| 10836 |
+
},
|
| 10837 |
+
{
|
| 10838 |
+
"epoch": 0.2904945323586582,
|
| 10839 |
+
"grad_norm": 0.1979866325855255,
|
| 10840 |
+
"learning_rate": 0.0007269952498697733,
|
| 10841 |
+
"loss": 2.5679,
|
| 10842 |
+
"num_input_tokens_seen": 15964565056,
|
| 10843 |
+
"step": 60900
|
| 10844 |
+
},
|
| 10845 |
+
{
|
| 10846 |
+
"epoch": 0.29073303361675235,
|
| 10847 |
+
"grad_norm": 0.2013344019651413,
|
| 10848 |
+
"learning_rate": 0.0007219825320152411,
|
| 10849 |
+
"loss": 2.5842,
|
| 10850 |
+
"num_input_tokens_seen": 15977672256,
|
| 10851 |
+
"step": 60950
|
| 10852 |
+
},
|
| 10853 |
+
{
|
| 10854 |
+
"epoch": 0.29097153487484645,
|
| 10855 |
+
"grad_norm": 0.20511233806610107,
|
| 10856 |
+
"learning_rate": 0.0007169418695587791,
|
| 10857 |
+
"loss": 2.5864,
|
| 10858 |
+
"num_input_tokens_seen": 15990779456,
|
| 10859 |
+
"step": 61000
|
| 10860 |
+
},
|
| 10861 |
+
{
|
| 10862 |
+
"epoch": 0.29097153487484645,
|
| 10863 |
+
"eval_loss": 2.4598097801208496,
|
| 10864 |
+
"eval_runtime": 53.5493,
|
| 10865 |
+
"eval_samples_per_second": 93.372,
|
| 10866 |
+
"eval_steps_per_second": 23.343,
|
| 10867 |
+
"num_input_tokens_seen": 15990779456,
|
| 10868 |
+
"step": 61000
|
| 10869 |
}
|
| 10870 |
],
|
| 10871 |
"logging_steps": 50,
|
| 10872 |
"max_steps": 70000,
|
| 10873 |
+
"num_input_tokens_seen": 15990779456,
|
| 10874 |
"num_train_epochs": 1,
|
| 10875 |
"save_steps": 1000,
|
| 10876 |
"stateful_callbacks": {
|
|
|
|
| 10885 |
"attributes": {}
|
| 10886 |
}
|
| 10887 |
},
|
| 10888 |
+
"total_flos": 4.2776895744874906e+18,
|
| 10889 |
"train_batch_size": 64,
|
| 10890 |
"trial_name": null,
|
| 10891 |
"trial_params": null
|