Training in progress, step 570000
Browse files- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/pytorch_model.bin +1 -1
- last-checkpoint/rng_state.pth +1 -1
- last-checkpoint/scaler.pt +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +303 -3
- pytorch_model.bin +1 -1
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 586828837
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fffdc57df6cba4aecc5d537199d05d28768deaf925b41240f122bcbc526d6c4d
|
| 3 |
size 586828837
|
last-checkpoint/pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 146774203
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc99a917e1b327405a8f3c276c96d3252b44e706de05260c86fdfb67a8ea2ba1
|
| 3 |
size 146774203
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14503
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9eb8cd28c207e550a8e102ab438e79bd35b1834dd9eb8b97b0c0f9aab456235f
|
| 3 |
size 14503
|
last-checkpoint/scaler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 559
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e8cfba3a731feb83ca65973baf77fc04cbf64fea750132892e69c52d95de7113
|
| 3 |
size 559
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 733555848
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f2425d50b58bd1bce863056a07fbed7929c2c0bfeef559ef18326c302aae672a
|
| 3 |
size 733555848
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
-
"epoch": 2.
|
| 5 |
-
"global_step":
|
| 6 |
"is_hyper_param_search": false,
|
| 7 |
"is_local_process_zero": true,
|
| 8 |
"is_world_process_zero": true,
|
|
@@ -16806,11 +16806,311 @@
|
|
| 16806 |
"learning_rate": 0.006970335826120932,
|
| 16807 |
"loss": 8.046,
|
| 16808 |
"step": 560000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16809 |
}
|
| 16810 |
],
|
| 16811 |
"max_steps": 1000000,
|
| 16812 |
"num_train_epochs": 5,
|
| 16813 |
-
"total_flos":
|
| 16814 |
"trial_name": null,
|
| 16815 |
"trial_params": null
|
| 16816 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
+
"epoch": 2.4494321185009476,
|
| 5 |
+
"global_step": 570000,
|
| 6 |
"is_hyper_param_search": false,
|
| 7 |
"is_local_process_zero": true,
|
| 8 |
"is_world_process_zero": true,
|
|
|
|
| 16806 |
"learning_rate": 0.006970335826120932,
|
| 16807 |
"loss": 8.046,
|
| 16808 |
"step": 560000
|
| 16809 |
+
},
|
| 16810 |
+
{
|
| 16811 |
+
"epoch": 2.41,
|
| 16812 |
+
"learning_rate": 0.006966210216011318,
|
| 16813 |
+
"loss": 8.0551,
|
| 16814 |
+
"step": 560200
|
| 16815 |
+
},
|
| 16816 |
+
{
|
| 16817 |
+
"epoch": 2.41,
|
| 16818 |
+
"learning_rate": 0.0069620830217777575,
|
| 16819 |
+
"loss": 8.0548,
|
| 16820 |
+
"step": 560400
|
| 16821 |
+
},
|
| 16822 |
+
{
|
| 16823 |
+
"epoch": 2.41,
|
| 16824 |
+
"learning_rate": 0.006957954246745461,
|
| 16825 |
+
"loss": 8.0485,
|
| 16826 |
+
"step": 560600
|
| 16827 |
+
},
|
| 16828 |
+
{
|
| 16829 |
+
"epoch": 2.41,
|
| 16830 |
+
"learning_rate": 0.006953823894240906,
|
| 16831 |
+
"loss": 8.0603,
|
| 16832 |
+
"step": 560800
|
| 16833 |
+
},
|
| 16834 |
+
{
|
| 16835 |
+
"epoch": 2.41,
|
| 16836 |
+
"learning_rate": 0.0069496919675918435,
|
| 16837 |
+
"loss": 8.051,
|
| 16838 |
+
"step": 561000
|
| 16839 |
+
},
|
| 16840 |
+
{
|
| 16841 |
+
"epoch": 2.41,
|
| 16842 |
+
"learning_rate": 0.006945558470127292,
|
| 16843 |
+
"loss": 8.0569,
|
| 16844 |
+
"step": 561200
|
| 16845 |
+
},
|
| 16846 |
+
{
|
| 16847 |
+
"epoch": 2.41,
|
| 16848 |
+
"learning_rate": 0.006941423405177537,
|
| 16849 |
+
"loss": 8.0389,
|
| 16850 |
+
"step": 561400
|
| 16851 |
+
},
|
| 16852 |
+
{
|
| 16853 |
+
"epoch": 2.41,
|
| 16854 |
+
"learning_rate": 0.0069372867760741225,
|
| 16855 |
+
"loss": 8.0413,
|
| 16856 |
+
"step": 561600
|
| 16857 |
+
},
|
| 16858 |
+
{
|
| 16859 |
+
"epoch": 2.41,
|
| 16860 |
+
"learning_rate": 0.006933148586149858,
|
| 16861 |
+
"loss": 8.0455,
|
| 16862 |
+
"step": 561800
|
| 16863 |
+
},
|
| 16864 |
+
{
|
| 16865 |
+
"epoch": 2.42,
|
| 16866 |
+
"learning_rate": 0.006929008838738809,
|
| 16867 |
+
"loss": 8.0532,
|
| 16868 |
+
"step": 562000
|
| 16869 |
+
},
|
| 16870 |
+
{
|
| 16871 |
+
"epoch": 2.42,
|
| 16872 |
+
"learning_rate": 0.006924867537176294,
|
| 16873 |
+
"loss": 8.0524,
|
| 16874 |
+
"step": 562200
|
| 16875 |
+
},
|
| 16876 |
+
{
|
| 16877 |
+
"epoch": 2.42,
|
| 16878 |
+
"learning_rate": 0.006920724684798886,
|
| 16879 |
+
"loss": 8.0644,
|
| 16880 |
+
"step": 562400
|
| 16881 |
+
},
|
| 16882 |
+
{
|
| 16883 |
+
"epoch": 2.42,
|
| 16884 |
+
"learning_rate": 0.006916580284944404,
|
| 16885 |
+
"loss": 8.0491,
|
| 16886 |
+
"step": 562600
|
| 16887 |
+
},
|
| 16888 |
+
{
|
| 16889 |
+
"epoch": 2.42,
|
| 16890 |
+
"learning_rate": 0.006912434340951918,
|
| 16891 |
+
"loss": 8.0658,
|
| 16892 |
+
"step": 562800
|
| 16893 |
+
},
|
| 16894 |
+
{
|
| 16895 |
+
"epoch": 2.42,
|
| 16896 |
+
"learning_rate": 0.006908286856161741,
|
| 16897 |
+
"loss": 8.0541,
|
| 16898 |
+
"step": 563000
|
| 16899 |
+
},
|
| 16900 |
+
{
|
| 16901 |
+
"epoch": 2.42,
|
| 16902 |
+
"learning_rate": 0.006904137833915425,
|
| 16903 |
+
"loss": 8.0429,
|
| 16904 |
+
"step": 563200
|
| 16905 |
+
},
|
| 16906 |
+
{
|
| 16907 |
+
"epoch": 2.42,
|
| 16908 |
+
"learning_rate": 0.006900008034148137,
|
| 16909 |
+
"loss": 8.047,
|
| 16910 |
+
"step": 563400
|
| 16911 |
+
},
|
| 16912 |
+
{
|
| 16913 |
+
"epoch": 2.42,
|
| 16914 |
+
"learning_rate": 0.006895855954664682,
|
| 16915 |
+
"loss": 8.0523,
|
| 16916 |
+
"step": 563600
|
| 16917 |
+
},
|
| 16918 |
+
{
|
| 16919 |
+
"epoch": 2.42,
|
| 16920 |
+
"learning_rate": 0.006891702347740443,
|
| 16921 |
+
"loss": 8.0611,
|
| 16922 |
+
"step": 563800
|
| 16923 |
+
},
|
| 16924 |
+
{
|
| 16925 |
+
"epoch": 2.42,
|
| 16926 |
+
"learning_rate": 0.0068875472167219025,
|
| 16927 |
+
"loss": 8.0624,
|
| 16928 |
+
"step": 564000
|
| 16929 |
+
},
|
| 16930 |
+
{
|
| 16931 |
+
"epoch": 2.42,
|
| 16932 |
+
"learning_rate": 0.006883390564956777,
|
| 16933 |
+
"loss": 8.0618,
|
| 16934 |
+
"step": 564200
|
| 16935 |
+
},
|
| 16936 |
+
{
|
| 16937 |
+
"epoch": 2.43,
|
| 16938 |
+
"learning_rate": 0.006879232395794005,
|
| 16939 |
+
"loss": 8.0637,
|
| 16940 |
+
"step": 564400
|
| 16941 |
+
},
|
| 16942 |
+
{
|
| 16943 |
+
"epoch": 2.43,
|
| 16944 |
+
"learning_rate": 0.006875072712583748,
|
| 16945 |
+
"loss": 8.0495,
|
| 16946 |
+
"step": 564600
|
| 16947 |
+
},
|
| 16948 |
+
{
|
| 16949 |
+
"epoch": 2.43,
|
| 16950 |
+
"learning_rate": 0.00687091151867739,
|
| 16951 |
+
"loss": 8.0603,
|
| 16952 |
+
"step": 564800
|
| 16953 |
+
},
|
| 16954 |
+
{
|
| 16955 |
+
"epoch": 2.43,
|
| 16956 |
+
"learning_rate": 0.006866748817427526,
|
| 16957 |
+
"loss": 8.0579,
|
| 16958 |
+
"step": 565000
|
| 16959 |
+
},
|
| 16960 |
+
{
|
| 16961 |
+
"epoch": 2.43,
|
| 16962 |
+
"learning_rate": 0.006862584612187971,
|
| 16963 |
+
"loss": 8.0629,
|
| 16964 |
+
"step": 565200
|
| 16965 |
+
},
|
| 16966 |
+
{
|
| 16967 |
+
"epoch": 2.43,
|
| 16968 |
+
"learning_rate": 0.006858439738570398,
|
| 16969 |
+
"loss": 8.0601,
|
| 16970 |
+
"step": 565400
|
| 16971 |
+
},
|
| 16972 |
+
{
|
| 16973 |
+
"epoch": 2.43,
|
| 16974 |
+
"learning_rate": 0.006854293382593129,
|
| 16975 |
+
"loss": 8.0478,
|
| 16976 |
+
"step": 565600
|
| 16977 |
+
},
|
| 16978 |
+
{
|
| 16979 |
+
"epoch": 2.43,
|
| 16980 |
+
"learning_rate": 0.00685012470044207,
|
| 16981 |
+
"loss": 8.06,
|
| 16982 |
+
"step": 565800
|
| 16983 |
+
},
|
| 16984 |
+
{
|
| 16985 |
+
"epoch": 2.43,
|
| 16986 |
+
"learning_rate": 0.006845954527695071,
|
| 16987 |
+
"loss": 8.0508,
|
| 16988 |
+
"step": 566000
|
| 16989 |
+
},
|
| 16990 |
+
{
|
| 16991 |
+
"epoch": 2.43,
|
| 16992 |
+
"learning_rate": 0.006841782867711967,
|
| 16993 |
+
"loss": 8.0748,
|
| 16994 |
+
"step": 566200
|
| 16995 |
+
},
|
| 16996 |
+
{
|
| 16997 |
+
"epoch": 2.43,
|
| 16998 |
+
"learning_rate": 0.006837609723853784,
|
| 16999 |
+
"loss": 8.0635,
|
| 17000 |
+
"step": 566400
|
| 17001 |
+
},
|
| 17002 |
+
{
|
| 17003 |
+
"epoch": 2.43,
|
| 17004 |
+
"learning_rate": 0.0068334350994827524,
|
| 17005 |
+
"loss": 8.0627,
|
| 17006 |
+
"step": 566600
|
| 17007 |
+
},
|
| 17008 |
+
{
|
| 17009 |
+
"epoch": 2.44,
|
| 17010 |
+
"learning_rate": 0.0068292589979622904,
|
| 17011 |
+
"loss": 8.0511,
|
| 17012 |
+
"step": 566800
|
| 17013 |
+
},
|
| 17014 |
+
{
|
| 17015 |
+
"epoch": 2.44,
|
| 17016 |
+
"learning_rate": 0.006825081422657008,
|
| 17017 |
+
"loss": 8.0495,
|
| 17018 |
+
"step": 567000
|
| 17019 |
+
},
|
| 17020 |
+
{
|
| 17021 |
+
"epoch": 2.44,
|
| 17022 |
+
"learning_rate": 0.0068209023769327005,
|
| 17023 |
+
"loss": 8.0555,
|
| 17024 |
+
"step": 567200
|
| 17025 |
+
},
|
| 17026 |
+
{
|
| 17027 |
+
"epoch": 2.44,
|
| 17028 |
+
"learning_rate": 0.006816721864156354,
|
| 17029 |
+
"loss": 8.0548,
|
| 17030 |
+
"step": 567400
|
| 17031 |
+
},
|
| 17032 |
+
{
|
| 17033 |
+
"epoch": 2.44,
|
| 17034 |
+
"learning_rate": 0.006812539887696127,
|
| 17035 |
+
"loss": 8.0487,
|
| 17036 |
+
"step": 567600
|
| 17037 |
+
},
|
| 17038 |
+
{
|
| 17039 |
+
"epoch": 2.44,
|
| 17040 |
+
"learning_rate": 0.006808356450921365,
|
| 17041 |
+
"loss": 8.0457,
|
| 17042 |
+
"step": 567800
|
| 17043 |
+
},
|
| 17044 |
+
{
|
| 17045 |
+
"epoch": 2.44,
|
| 17046 |
+
"learning_rate": 0.0068041715572025865,
|
| 17047 |
+
"loss": 8.0417,
|
| 17048 |
+
"step": 568000
|
| 17049 |
+
},
|
| 17050 |
+
{
|
| 17051 |
+
"epoch": 2.44,
|
| 17052 |
+
"learning_rate": 0.006799985209911487,
|
| 17053 |
+
"loss": 8.0564,
|
| 17054 |
+
"step": 568200
|
| 17055 |
+
},
|
| 17056 |
+
{
|
| 17057 |
+
"epoch": 2.44,
|
| 17058 |
+
"learning_rate": 0.0067957974124209265,
|
| 17059 |
+
"loss": 8.0481,
|
| 17060 |
+
"step": 568400
|
| 17061 |
+
},
|
| 17062 |
+
{
|
| 17063 |
+
"epoch": 2.44,
|
| 17064 |
+
"learning_rate": 0.0067916081681049425,
|
| 17065 |
+
"loss": 8.0318,
|
| 17066 |
+
"step": 568600
|
| 17067 |
+
},
|
| 17068 |
+
{
|
| 17069 |
+
"epoch": 2.44,
|
| 17070 |
+
"learning_rate": 0.00678741748033873,
|
| 17071 |
+
"loss": 8.0717,
|
| 17072 |
+
"step": 568800
|
| 17073 |
+
},
|
| 17074 |
+
{
|
| 17075 |
+
"epoch": 2.45,
|
| 17076 |
+
"learning_rate": 0.006783225352498653,
|
| 17077 |
+
"loss": 8.0506,
|
| 17078 |
+
"step": 569000
|
| 17079 |
+
},
|
| 17080 |
+
{
|
| 17081 |
+
"epoch": 2.45,
|
| 17082 |
+
"learning_rate": 0.0067790317879622315,
|
| 17083 |
+
"loss": 8.0453,
|
| 17084 |
+
"step": 569200
|
| 17085 |
+
},
|
| 17086 |
+
{
|
| 17087 |
+
"epoch": 2.45,
|
| 17088 |
+
"learning_rate": 0.006774836790108145,
|
| 17089 |
+
"loss": 8.0478,
|
| 17090 |
+
"step": 569400
|
| 17091 |
+
},
|
| 17092 |
+
{
|
| 17093 |
+
"epoch": 2.45,
|
| 17094 |
+
"learning_rate": 0.006770661348006565,
|
| 17095 |
+
"loss": 8.0587,
|
| 17096 |
+
"step": 569600
|
| 17097 |
+
},
|
| 17098 |
+
{
|
| 17099 |
+
"epoch": 2.45,
|
| 17100 |
+
"learning_rate": 0.006766463500782177,
|
| 17101 |
+
"loss": 8.0524,
|
| 17102 |
+
"step": 569800
|
| 17103 |
+
},
|
| 17104 |
+
{
|
| 17105 |
+
"epoch": 2.45,
|
| 17106 |
+
"learning_rate": 0.006762285230252838,
|
| 17107 |
+
"loss": 8.0655,
|
| 17108 |
+
"step": 570000
|
| 17109 |
}
|
| 17110 |
],
|
| 17111 |
"max_steps": 1000000,
|
| 17112 |
"num_train_epochs": 5,
|
| 17113 |
+
"total_flos": 9.084816952573256e+17,
|
| 17114 |
"trial_name": null,
|
| 17115 |
"trial_params": null
|
| 17116 |
}
|
pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 146774203
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc99a917e1b327405a8f3c276c96d3252b44e706de05260c86fdfb67a8ea2ba1
|
| 3 |
size 146774203
|