Training in progress, step 29000, checkpoint
Browse files- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/pytorch_model.bin +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +353 -3
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 304481530
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:746096b767c4c47c0c49b66fcf9e67e43d00132964e1a14503d0dc54e61a88ce
|
| 3 |
size 304481530
|
last-checkpoint/pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 402029570
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ff72e1a59e706ba21c0c5fc5faf4ff560d04a9269b480a240031d2014cadf01
|
| 3 |
size 402029570
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:46ed9fc518619ac92c06b536cae3d8dd21e3799906ab806f17d4dd1aa6e8dd9d
|
| 3 |
size 14960
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f0bcf7c080583def4d92e63cc47df57eaf4cf519a6a214957e6214d525864a6a
|
| 3 |
size 14960
|
last-checkpoint/rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bbfa9291779333cc6de79bd13fa6c586039654ce156817d635f2b7564e084805
|
| 3 |
size 14960
|
last-checkpoint/rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2d36fdfb70bf9082281ebe37b706d22a6591594718aa46603291c3e49697116
|
| 3 |
size 14960
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1cd39fe9272798d41cdbf7f22a06af7a14c62772e1b67733185e58a79e1dfc7e
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -9808,6 +9808,356 @@
|
|
| 9808 |
"learning_rate": 0.000491060032601183,
|
| 9809 |
"loss": 18.3958,
|
| 9810 |
"step": 28000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9811 |
}
|
| 9812 |
],
|
| 9813 |
"logging_steps": 20,
|
|
@@ -9827,7 +10177,7 @@
|
|
| 9827 |
"attributes": {}
|
| 9828 |
}
|
| 9829 |
},
|
| 9830 |
-
"total_flos": 2.
|
| 9831 |
"train_batch_size": 48,
|
| 9832 |
"trial_name": null,
|
| 9833 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.05654922951674783,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 29000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 9808 |
"learning_rate": 0.000491060032601183,
|
| 9809 |
"loss": 18.3958,
|
| 9810 |
"step": 28000
|
| 9811 |
+
},
|
| 9812 |
+
{
|
| 9813 |
+
"epoch": 0.05463825555376808,
|
| 9814 |
+
"grad_norm": 8.4375,
|
| 9815 |
+
"learning_rate": 0.0004910535305702737,
|
| 9816 |
+
"loss": 18.3374,
|
| 9817 |
+
"step": 28020
|
| 9818 |
+
},
|
| 9819 |
+
{
|
| 9820 |
+
"epoch": 0.05467725502240032,
|
| 9821 |
+
"grad_norm": 8.4375,
|
| 9822 |
+
"learning_rate": 0.0004910470285393643,
|
| 9823 |
+
"loss": 18.3651,
|
| 9824 |
+
"step": 28040
|
| 9825 |
+
},
|
| 9826 |
+
{
|
| 9827 |
+
"epoch": 0.05471625449103256,
|
| 9828 |
+
"grad_norm": 7.9375,
|
| 9829 |
+
"learning_rate": 0.000491040526508455,
|
| 9830 |
+
"loss": 18.2815,
|
| 9831 |
+
"step": 28060
|
| 9832 |
+
},
|
| 9833 |
+
{
|
| 9834 |
+
"epoch": 0.0547552539596648,
|
| 9835 |
+
"grad_norm": 8.1875,
|
| 9836 |
+
"learning_rate": 0.0004910340244775455,
|
| 9837 |
+
"loss": 18.3387,
|
| 9838 |
+
"step": 28080
|
| 9839 |
+
},
|
| 9840 |
+
{
|
| 9841 |
+
"epoch": 0.05479425342829704,
|
| 9842 |
+
"grad_norm": 8.125,
|
| 9843 |
+
"learning_rate": 0.0004910275224466362,
|
| 9844 |
+
"loss": 18.4166,
|
| 9845 |
+
"step": 28100
|
| 9846 |
+
},
|
| 9847 |
+
{
|
| 9848 |
+
"epoch": 0.05483325289692928,
|
| 9849 |
+
"grad_norm": 8.3125,
|
| 9850 |
+
"learning_rate": 0.0004910210204157268,
|
| 9851 |
+
"loss": 18.3669,
|
| 9852 |
+
"step": 28120
|
| 9853 |
+
},
|
| 9854 |
+
{
|
| 9855 |
+
"epoch": 0.05487225236556152,
|
| 9856 |
+
"grad_norm": 8.1875,
|
| 9857 |
+
"learning_rate": 0.0004910145183848175,
|
| 9858 |
+
"loss": 18.3254,
|
| 9859 |
+
"step": 28140
|
| 9860 |
+
},
|
| 9861 |
+
{
|
| 9862 |
+
"epoch": 0.05491125183419376,
|
| 9863 |
+
"grad_norm": 7.90625,
|
| 9864 |
+
"learning_rate": 0.0004910080163539082,
|
| 9865 |
+
"loss": 18.3784,
|
| 9866 |
+
"step": 28160
|
| 9867 |
+
},
|
| 9868 |
+
{
|
| 9869 |
+
"epoch": 0.054950251302826,
|
| 9870 |
+
"grad_norm": 8.0625,
|
| 9871 |
+
"learning_rate": 0.0004910015143229988,
|
| 9872 |
+
"loss": 18.3403,
|
| 9873 |
+
"step": 28180
|
| 9874 |
+
},
|
| 9875 |
+
{
|
| 9876 |
+
"epoch": 0.05498925077145824,
|
| 9877 |
+
"grad_norm": 9.375,
|
| 9878 |
+
"learning_rate": 0.0004909950122920895,
|
| 9879 |
+
"loss": 18.3102,
|
| 9880 |
+
"step": 28200
|
| 9881 |
+
},
|
| 9882 |
+
{
|
| 9883 |
+
"epoch": 0.055028250240090476,
|
| 9884 |
+
"grad_norm": 9.0625,
|
| 9885 |
+
"learning_rate": 0.00049098851026118,
|
| 9886 |
+
"loss": 18.4089,
|
| 9887 |
+
"step": 28220
|
| 9888 |
+
},
|
| 9889 |
+
{
|
| 9890 |
+
"epoch": 0.05506724970872272,
|
| 9891 |
+
"grad_norm": 8.5625,
|
| 9892 |
+
"learning_rate": 0.0004909820082302707,
|
| 9893 |
+
"loss": 18.3223,
|
| 9894 |
+
"step": 28240
|
| 9895 |
+
},
|
| 9896 |
+
{
|
| 9897 |
+
"epoch": 0.05510624917735496,
|
| 9898 |
+
"grad_norm": 8.5625,
|
| 9899 |
+
"learning_rate": 0.0004909755061993613,
|
| 9900 |
+
"loss": 18.3313,
|
| 9901 |
+
"step": 28260
|
| 9902 |
+
},
|
| 9903 |
+
{
|
| 9904 |
+
"epoch": 0.055145248645987195,
|
| 9905 |
+
"grad_norm": 8.6875,
|
| 9906 |
+
"learning_rate": 0.000490969004168452,
|
| 9907 |
+
"loss": 18.3614,
|
| 9908 |
+
"step": 28280
|
| 9909 |
+
},
|
| 9910 |
+
{
|
| 9911 |
+
"epoch": 0.05518424811461944,
|
| 9912 |
+
"grad_norm": 8.25,
|
| 9913 |
+
"learning_rate": 0.0004909625021375427,
|
| 9914 |
+
"loss": 18.4159,
|
| 9915 |
+
"step": 28300
|
| 9916 |
+
},
|
| 9917 |
+
{
|
| 9918 |
+
"epoch": 0.05522324758325168,
|
| 9919 |
+
"grad_norm": 9.625,
|
| 9920 |
+
"learning_rate": 0.0004909560001066333,
|
| 9921 |
+
"loss": 18.3133,
|
| 9922 |
+
"step": 28320
|
| 9923 |
+
},
|
| 9924 |
+
{
|
| 9925 |
+
"epoch": 0.05526224705188392,
|
| 9926 |
+
"grad_norm": 8.625,
|
| 9927 |
+
"learning_rate": 0.000490949498075724,
|
| 9928 |
+
"loss": 18.2357,
|
| 9929 |
+
"step": 28340
|
| 9930 |
+
},
|
| 9931 |
+
{
|
| 9932 |
+
"epoch": 0.055301246520516156,
|
| 9933 |
+
"grad_norm": 8.625,
|
| 9934 |
+
"learning_rate": 0.0004909429960448146,
|
| 9935 |
+
"loss": 18.3533,
|
| 9936 |
+
"step": 28360
|
| 9937 |
+
},
|
| 9938 |
+
{
|
| 9939 |
+
"epoch": 0.0553402459891484,
|
| 9940 |
+
"grad_norm": 8.0625,
|
| 9941 |
+
"learning_rate": 0.0004909364940139053,
|
| 9942 |
+
"loss": 18.4396,
|
| 9943 |
+
"step": 28380
|
| 9944 |
+
},
|
| 9945 |
+
{
|
| 9946 |
+
"epoch": 0.05537924545778064,
|
| 9947 |
+
"grad_norm": 9.25,
|
| 9948 |
+
"learning_rate": 0.0004909299919829958,
|
| 9949 |
+
"loss": 18.298,
|
| 9950 |
+
"step": 28400
|
| 9951 |
+
},
|
| 9952 |
+
{
|
| 9953 |
+
"epoch": 0.055418244926412875,
|
| 9954 |
+
"grad_norm": 8.125,
|
| 9955 |
+
"learning_rate": 0.0004909234899520865,
|
| 9956 |
+
"loss": 18.271,
|
| 9957 |
+
"step": 28420
|
| 9958 |
+
},
|
| 9959 |
+
{
|
| 9960 |
+
"epoch": 0.05545724439504512,
|
| 9961 |
+
"grad_norm": 7.1875,
|
| 9962 |
+
"learning_rate": 0.0004909169879211771,
|
| 9963 |
+
"loss": 18.3477,
|
| 9964 |
+
"step": 28440
|
| 9965 |
+
},
|
| 9966 |
+
{
|
| 9967 |
+
"epoch": 0.05549624386367736,
|
| 9968 |
+
"grad_norm": 9.125,
|
| 9969 |
+
"learning_rate": 0.0004909104858902678,
|
| 9970 |
+
"loss": 18.346,
|
| 9971 |
+
"step": 28460
|
| 9972 |
+
},
|
| 9973 |
+
{
|
| 9974 |
+
"epoch": 0.055535243332309595,
|
| 9975 |
+
"grad_norm": 9.125,
|
| 9976 |
+
"learning_rate": 0.0004909039838593585,
|
| 9977 |
+
"loss": 18.3856,
|
| 9978 |
+
"step": 28480
|
| 9979 |
+
},
|
| 9980 |
+
{
|
| 9981 |
+
"epoch": 0.05557424280094184,
|
| 9982 |
+
"grad_norm": 8.0625,
|
| 9983 |
+
"learning_rate": 0.0004908974818284491,
|
| 9984 |
+
"loss": 18.3034,
|
| 9985 |
+
"step": 28500
|
| 9986 |
+
},
|
| 9987 |
+
{
|
| 9988 |
+
"epoch": 0.05561324226957408,
|
| 9989 |
+
"grad_norm": 9.375,
|
| 9990 |
+
"learning_rate": 0.0004908909797975398,
|
| 9991 |
+
"loss": 18.2843,
|
| 9992 |
+
"step": 28520
|
| 9993 |
+
},
|
| 9994 |
+
{
|
| 9995 |
+
"epoch": 0.055652241738206314,
|
| 9996 |
+
"grad_norm": 9.4375,
|
| 9997 |
+
"learning_rate": 0.0004908844777666304,
|
| 9998 |
+
"loss": 18.2606,
|
| 9999 |
+
"step": 28540
|
| 10000 |
+
},
|
| 10001 |
+
{
|
| 10002 |
+
"epoch": 0.055691241206838556,
|
| 10003 |
+
"grad_norm": 8.25,
|
| 10004 |
+
"learning_rate": 0.0004908779757357211,
|
| 10005 |
+
"loss": 18.3211,
|
| 10006 |
+
"step": 28560
|
| 10007 |
+
},
|
| 10008 |
+
{
|
| 10009 |
+
"epoch": 0.0557302406754708,
|
| 10010 |
+
"grad_norm": 8.0625,
|
| 10011 |
+
"learning_rate": 0.0004908714737048117,
|
| 10012 |
+
"loss": 18.2635,
|
| 10013 |
+
"step": 28580
|
| 10014 |
+
},
|
| 10015 |
+
{
|
| 10016 |
+
"epoch": 0.05576924014410304,
|
| 10017 |
+
"grad_norm": 8.4375,
|
| 10018 |
+
"learning_rate": 0.0004908649716739024,
|
| 10019 |
+
"loss": 18.3031,
|
| 10020 |
+
"step": 28600
|
| 10021 |
+
},
|
| 10022 |
+
{
|
| 10023 |
+
"epoch": 0.055808239612735275,
|
| 10024 |
+
"grad_norm": 8.1875,
|
| 10025 |
+
"learning_rate": 0.0004908584696429931,
|
| 10026 |
+
"loss": 18.3625,
|
| 10027 |
+
"step": 28620
|
| 10028 |
+
},
|
| 10029 |
+
{
|
| 10030 |
+
"epoch": 0.05584723908136752,
|
| 10031 |
+
"grad_norm": 9.125,
|
| 10032 |
+
"learning_rate": 0.0004908519676120837,
|
| 10033 |
+
"loss": 18.2676,
|
| 10034 |
+
"step": 28640
|
| 10035 |
+
},
|
| 10036 |
+
{
|
| 10037 |
+
"epoch": 0.05588623854999976,
|
| 10038 |
+
"grad_norm": 8.625,
|
| 10039 |
+
"learning_rate": 0.0004908454655811743,
|
| 10040 |
+
"loss": 18.356,
|
| 10041 |
+
"step": 28660
|
| 10042 |
+
},
|
| 10043 |
+
{
|
| 10044 |
+
"epoch": 0.055925238018631994,
|
| 10045 |
+
"grad_norm": 8.5625,
|
| 10046 |
+
"learning_rate": 0.0004908389635502649,
|
| 10047 |
+
"loss": 18.3445,
|
| 10048 |
+
"step": 28680
|
| 10049 |
+
},
|
| 10050 |
+
{
|
| 10051 |
+
"epoch": 0.055964237487264236,
|
| 10052 |
+
"grad_norm": 10.4375,
|
| 10053 |
+
"learning_rate": 0.0004908324615193556,
|
| 10054 |
+
"loss": 18.2686,
|
| 10055 |
+
"step": 28700
|
| 10056 |
+
},
|
| 10057 |
+
{
|
| 10058 |
+
"epoch": 0.05600323695589648,
|
| 10059 |
+
"grad_norm": 9.125,
|
| 10060 |
+
"learning_rate": 0.0004908259594884462,
|
| 10061 |
+
"loss": 18.3601,
|
| 10062 |
+
"step": 28720
|
| 10063 |
+
},
|
| 10064 |
+
{
|
| 10065 |
+
"epoch": 0.05604223642452871,
|
| 10066 |
+
"grad_norm": 8.5625,
|
| 10067 |
+
"learning_rate": 0.0004908194574575369,
|
| 10068 |
+
"loss": 18.2382,
|
| 10069 |
+
"step": 28740
|
| 10070 |
+
},
|
| 10071 |
+
{
|
| 10072 |
+
"epoch": 0.056081235893160955,
|
| 10073 |
+
"grad_norm": 8.5,
|
| 10074 |
+
"learning_rate": 0.0004908129554266275,
|
| 10075 |
+
"loss": 18.2971,
|
| 10076 |
+
"step": 28760
|
| 10077 |
+
},
|
| 10078 |
+
{
|
| 10079 |
+
"epoch": 0.0561202353617932,
|
| 10080 |
+
"grad_norm": 8.3125,
|
| 10081 |
+
"learning_rate": 0.0004908064533957182,
|
| 10082 |
+
"loss": 18.2238,
|
| 10083 |
+
"step": 28780
|
| 10084 |
+
},
|
| 10085 |
+
{
|
| 10086 |
+
"epoch": 0.05615923483042543,
|
| 10087 |
+
"grad_norm": 8.1875,
|
| 10088 |
+
"learning_rate": 0.0004907999513648089,
|
| 10089 |
+
"loss": 18.32,
|
| 10090 |
+
"step": 28800
|
| 10091 |
+
},
|
| 10092 |
+
{
|
| 10093 |
+
"epoch": 0.056198234299057674,
|
| 10094 |
+
"grad_norm": 9.25,
|
| 10095 |
+
"learning_rate": 0.0004907934493338995,
|
| 10096 |
+
"loss": 18.2592,
|
| 10097 |
+
"step": 28820
|
| 10098 |
+
},
|
| 10099 |
+
{
|
| 10100 |
+
"epoch": 0.056237233767689916,
|
| 10101 |
+
"grad_norm": 8.25,
|
| 10102 |
+
"learning_rate": 0.0004907869473029901,
|
| 10103 |
+
"loss": 18.3136,
|
| 10104 |
+
"step": 28840
|
| 10105 |
+
},
|
| 10106 |
+
{
|
| 10107 |
+
"epoch": 0.05627623323632216,
|
| 10108 |
+
"grad_norm": 7.875,
|
| 10109 |
+
"learning_rate": 0.0004907804452720807,
|
| 10110 |
+
"loss": 18.2562,
|
| 10111 |
+
"step": 28860
|
| 10112 |
+
},
|
| 10113 |
+
{
|
| 10114 |
+
"epoch": 0.05631523270495439,
|
| 10115 |
+
"grad_norm": 9.25,
|
| 10116 |
+
"learning_rate": 0.0004907739432411714,
|
| 10117 |
+
"loss": 18.2101,
|
| 10118 |
+
"step": 28880
|
| 10119 |
+
},
|
| 10120 |
+
{
|
| 10121 |
+
"epoch": 0.056354232173586635,
|
| 10122 |
+
"grad_norm": 8.75,
|
| 10123 |
+
"learning_rate": 0.000490767441210262,
|
| 10124 |
+
"loss": 18.3202,
|
| 10125 |
+
"step": 28900
|
| 10126 |
+
},
|
| 10127 |
+
{
|
| 10128 |
+
"epoch": 0.05639323164221888,
|
| 10129 |
+
"grad_norm": 9.5,
|
| 10130 |
+
"learning_rate": 0.0004907609391793527,
|
| 10131 |
+
"loss": 18.3016,
|
| 10132 |
+
"step": 28920
|
| 10133 |
+
},
|
| 10134 |
+
{
|
| 10135 |
+
"epoch": 0.05643223111085111,
|
| 10136 |
+
"grad_norm": 8.8125,
|
| 10137 |
+
"learning_rate": 0.0004907544371484434,
|
| 10138 |
+
"loss": 18.2731,
|
| 10139 |
+
"step": 28940
|
| 10140 |
+
},
|
| 10141 |
+
{
|
| 10142 |
+
"epoch": 0.056471230579483354,
|
| 10143 |
+
"grad_norm": 8.4375,
|
| 10144 |
+
"learning_rate": 0.000490747935117534,
|
| 10145 |
+
"loss": 18.2778,
|
| 10146 |
+
"step": 28960
|
| 10147 |
+
},
|
| 10148 |
+
{
|
| 10149 |
+
"epoch": 0.056510230048115596,
|
| 10150 |
+
"grad_norm": 7.1875,
|
| 10151 |
+
"learning_rate": 0.0004907414330866247,
|
| 10152 |
+
"loss": 18.2984,
|
| 10153 |
+
"step": 28980
|
| 10154 |
+
},
|
| 10155 |
+
{
|
| 10156 |
+
"epoch": 0.05654922951674783,
|
| 10157 |
+
"grad_norm": 8.3125,
|
| 10158 |
+
"learning_rate": 0.0004907349310557152,
|
| 10159 |
+
"loss": 18.2915,
|
| 10160 |
+
"step": 29000
|
| 10161 |
}
|
| 10162 |
],
|
| 10163 |
"logging_steps": 20,
|
|
|
|
| 10177 |
"attributes": {}
|
| 10178 |
}
|
| 10179 |
},
|
| 10180 |
+
"total_flos": 2.1319717710265844e+19,
|
| 10181 |
"train_batch_size": 48,
|
| 10182 |
"trial_name": null,
|
| 10183 |
"trial_params": null
|