Training in progress, step 56000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e3d360be7fe2543c78a1f7ac85877b8ebcc55a8fc7ce7ea8871241b28859be01
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c586225c37191bdb386336c5aa7eba4c313537c276b8b87dd7fefbcb4a3ca975
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0f1d7953b9adf97d81c8d5df7c90f2cd3786e196584c751d3c25ee459604bb2b
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:56641b065a04f5f757422df636842a91ff2acd7d071b6672db512bd44af71813
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -9798,11 +9798,189 @@
|
|
| 9798 |
"eval_steps_per_second": 23.6,
|
| 9799 |
"num_input_tokens_seen": 14417920000,
|
| 9800 |
"step": 55000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9801 |
}
|
| 9802 |
],
|
| 9803 |
"logging_steps": 50,
|
| 9804 |
"max_steps": 60000,
|
| 9805 |
-
"num_input_tokens_seen":
|
| 9806 |
"num_train_epochs": 1,
|
| 9807 |
"save_steps": 1000,
|
| 9808 |
"stateful_callbacks": {
|
|
@@ -9817,7 +9995,7 @@
|
|
| 9817 |
"attributes": {}
|
| 9818 |
}
|
| 9819 |
},
|
| 9820 |
-
"total_flos": 3.
|
| 9821 |
"train_batch_size": 64,
|
| 9822 |
"trial_name": null,
|
| 9823 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.37668393695252606,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 56000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 9798 |
"eval_steps_per_second": 23.6,
|
| 9799 |
"num_input_tokens_seen": 14417920000,
|
| 9800 |
"step": 55000
|
| 9801 |
+
},
|
| 9802 |
+
{
|
| 9803 |
+
"epoch": 0.3702937630220814,
|
| 9804 |
+
"grad_norm": 0.2545956075191498,
|
| 9805 |
+
"learning_rate": 0.0009263200821770461,
|
| 9806 |
+
"loss": 3.0397,
|
| 9807 |
+
"num_input_tokens_seen": 14431027200,
|
| 9808 |
+
"step": 55050
|
| 9809 |
+
},
|
| 9810 |
+
{
|
| 9811 |
+
"epoch": 0.370630087965789,
|
| 9812 |
+
"grad_norm": 0.26363736391067505,
|
| 9813 |
+
"learning_rate": 0.0009193352839727121,
|
| 9814 |
+
"loss": 3.0554,
|
| 9815 |
+
"num_input_tokens_seen": 14444134400,
|
| 9816 |
+
"step": 55100
|
| 9817 |
+
},
|
| 9818 |
+
{
|
| 9819 |
+
"epoch": 0.3709664129094966,
|
| 9820 |
+
"grad_norm": 0.2228112667798996,
|
| 9821 |
+
"learning_rate": 0.0009120630943110077,
|
| 9822 |
+
"loss": 3.0482,
|
| 9823 |
+
"num_input_tokens_seen": 14457241600,
|
| 9824 |
+
"step": 55150
|
| 9825 |
+
},
|
| 9826 |
+
{
|
| 9827 |
+
"epoch": 0.3713027378532043,
|
| 9828 |
+
"grad_norm": 0.2184106856584549,
|
| 9829 |
+
"learning_rate": 0.0009045084971874737,
|
| 9830 |
+
"loss": 3.0368,
|
| 9831 |
+
"num_input_tokens_seen": 14470348800,
|
| 9832 |
+
"step": 55200
|
| 9833 |
+
},
|
| 9834 |
+
{
|
| 9835 |
+
"epoch": 0.3716390627969119,
|
| 9836 |
+
"grad_norm": 0.5658212900161743,
|
| 9837 |
+
"learning_rate": 0.0008966766701456176,
|
| 9838 |
+
"loss": 3.0541,
|
| 9839 |
+
"num_input_tokens_seen": 14483456000,
|
| 9840 |
+
"step": 55250
|
| 9841 |
+
},
|
| 9842 |
+
{
|
| 9843 |
+
"epoch": 0.3719753877406195,
|
| 9844 |
+
"grad_norm": 0.31839439272880554,
|
| 9845 |
+
"learning_rate": 0.0008885729807284854,
|
| 9846 |
+
"loss": 3.0516,
|
| 9847 |
+
"num_input_tokens_seen": 14496563200,
|
| 9848 |
+
"step": 55300
|
| 9849 |
+
},
|
| 9850 |
+
{
|
| 9851 |
+
"epoch": 0.3723117126843271,
|
| 9852 |
+
"grad_norm": 0.2521055042743683,
|
| 9853 |
+
"learning_rate": 0.0008802029828000156,
|
| 9854 |
+
"loss": 3.049,
|
| 9855 |
+
"num_input_tokens_seen": 14509670400,
|
| 9856 |
+
"step": 55350
|
| 9857 |
+
},
|
| 9858 |
+
{
|
| 9859 |
+
"epoch": 0.3726480376280347,
|
| 9860 |
+
"grad_norm": 0.23797062039375305,
|
| 9861 |
+
"learning_rate": 0.0008715724127386971,
|
| 9862 |
+
"loss": 3.0393,
|
| 9863 |
+
"num_input_tokens_seen": 14522777600,
|
| 9864 |
+
"step": 55400
|
| 9865 |
+
},
|
| 9866 |
+
{
|
| 9867 |
+
"epoch": 0.37298436257174233,
|
| 9868 |
+
"grad_norm": 0.26673102378845215,
|
| 9869 |
+
"learning_rate": 0.0008626871855061438,
|
| 9870 |
+
"loss": 3.0535,
|
| 9871 |
+
"num_input_tokens_seen": 14535884800,
|
| 9872 |
+
"step": 55450
|
| 9873 |
+
},
|
| 9874 |
+
{
|
| 9875 |
+
"epoch": 0.37332068751544994,
|
| 9876 |
+
"grad_norm": 0.37754055857658386,
|
| 9877 |
+
"learning_rate": 0.0008535533905932737,
|
| 9878 |
+
"loss": 3.0432,
|
| 9879 |
+
"num_input_tokens_seen": 14548992000,
|
| 9880 |
+
"step": 55500
|
| 9881 |
+
},
|
| 9882 |
+
{
|
| 9883 |
+
"epoch": 0.37332068751544994,
|
| 9884 |
+
"eval_loss": 2.9362170696258545,
|
| 9885 |
+
"eval_runtime": 53.4795,
|
| 9886 |
+
"eval_samples_per_second": 93.494,
|
| 9887 |
+
"eval_steps_per_second": 23.373,
|
| 9888 |
+
"num_input_tokens_seen": 14548992000,
|
| 9889 |
+
"step": 55500
|
| 9890 |
+
},
|
| 9891 |
+
{
|
| 9892 |
+
"epoch": 0.37365701245915756,
|
| 9893 |
+
"grad_norm": 0.2160724252462387,
|
| 9894 |
+
"learning_rate": 0.000844177287846877,
|
| 9895 |
+
"loss": 3.0378,
|
| 9896 |
+
"num_input_tokens_seen": 14562099200,
|
| 9897 |
+
"step": 55550
|
| 9898 |
+
},
|
| 9899 |
+
{
|
| 9900 |
+
"epoch": 0.37399333740286517,
|
| 9901 |
+
"grad_norm": 0.22323860228061676,
|
| 9902 |
+
"learning_rate": 0.0008345653031794292,
|
| 9903 |
+
"loss": 3.0419,
|
| 9904 |
+
"num_input_tokens_seen": 14575206400,
|
| 9905 |
+
"step": 55600
|
| 9906 |
+
},
|
| 9907 |
+
{
|
| 9908 |
+
"epoch": 0.3743296623465728,
|
| 9909 |
+
"grad_norm": 0.19688346982002258,
|
| 9910 |
+
"learning_rate": 0.0008247240241650918,
|
| 9911 |
+
"loss": 3.0297,
|
| 9912 |
+
"num_input_tokens_seen": 14588313600,
|
| 9913 |
+
"step": 55650
|
| 9914 |
+
},
|
| 9915 |
+
{
|
| 9916 |
+
"epoch": 0.3746659872902804,
|
| 9917 |
+
"grad_norm": 0.1972673088312149,
|
| 9918 |
+
"learning_rate": 0.0008146601955249188,
|
| 9919 |
+
"loss": 3.0405,
|
| 9920 |
+
"num_input_tokens_seen": 14601420800,
|
| 9921 |
+
"step": 55700
|
| 9922 |
+
},
|
| 9923 |
+
{
|
| 9924 |
+
"epoch": 0.375002312233988,
|
| 9925 |
+
"grad_norm": 0.44073277711868286,
|
| 9926 |
+
"learning_rate": 0.0008043807145043603,
|
| 9927 |
+
"loss": 3.0343,
|
| 9928 |
+
"num_input_tokens_seen": 14614528000,
|
| 9929 |
+
"step": 55750
|
| 9930 |
+
},
|
| 9931 |
+
{
|
| 9932 |
+
"epoch": 0.3753386371776956,
|
| 9933 |
+
"grad_norm": 0.22042399644851685,
|
| 9934 |
+
"learning_rate": 0.0007938926261462366,
|
| 9935 |
+
"loss": 3.0337,
|
| 9936 |
+
"num_input_tokens_seen": 14627635200,
|
| 9937 |
+
"step": 55800
|
| 9938 |
+
},
|
| 9939 |
+
{
|
| 9940 |
+
"epoch": 0.3756749621214032,
|
| 9941 |
+
"grad_norm": 0.2954588234424591,
|
| 9942 |
+
"learning_rate": 0.0007832031184624164,
|
| 9943 |
+
"loss": 3.0334,
|
| 9944 |
+
"num_input_tokens_seen": 14640742400,
|
| 9945 |
+
"step": 55850
|
| 9946 |
+
},
|
| 9947 |
+
{
|
| 9948 |
+
"epoch": 0.37601128706511083,
|
| 9949 |
+
"grad_norm": 0.5062097907066345,
|
| 9950 |
+
"learning_rate": 0.0007723195175075137,
|
| 9951 |
+
"loss": 3.0385,
|
| 9952 |
+
"num_input_tokens_seen": 14653849600,
|
| 9953 |
+
"step": 55900
|
| 9954 |
+
},
|
| 9955 |
+
{
|
| 9956 |
+
"epoch": 0.37634761200881844,
|
| 9957 |
+
"grad_norm": 0.30344095826148987,
|
| 9958 |
+
"learning_rate": 0.0007612492823579744,
|
| 9959 |
+
"loss": 3.04,
|
| 9960 |
+
"num_input_tokens_seen": 14666956800,
|
| 9961 |
+
"step": 55950
|
| 9962 |
+
},
|
| 9963 |
+
{
|
| 9964 |
+
"epoch": 0.37668393695252606,
|
| 9965 |
+
"grad_norm": 0.21088473498821259,
|
| 9966 |
+
"learning_rate": 0.00075,
|
| 9967 |
+
"loss": 3.0364,
|
| 9968 |
+
"num_input_tokens_seen": 14680064000,
|
| 9969 |
+
"step": 56000
|
| 9970 |
+
},
|
| 9971 |
+
{
|
| 9972 |
+
"epoch": 0.37668393695252606,
|
| 9973 |
+
"eval_loss": 2.9313743114471436,
|
| 9974 |
+
"eval_runtime": 53.142,
|
| 9975 |
+
"eval_samples_per_second": 94.088,
|
| 9976 |
+
"eval_steps_per_second": 23.522,
|
| 9977 |
+
"num_input_tokens_seen": 14680064000,
|
| 9978 |
+
"step": 56000
|
| 9979 |
}
|
| 9980 |
],
|
| 9981 |
"logging_steps": 50,
|
| 9982 |
"max_steps": 60000,
|
| 9983 |
+
"num_input_tokens_seen": 14680064000,
|
| 9984 |
"num_train_epochs": 1,
|
| 9985 |
"save_steps": 1000,
|
| 9986 |
"stateful_callbacks": {
|
|
|
|
| 9995 |
"attributes": {}
|
| 9996 |
}
|
| 9997 |
},
|
| 9998 |
+
"total_flos": 3.92706039742464e+18,
|
| 9999 |
"train_batch_size": 64,
|
| 10000 |
"trial_name": null,
|
| 10001 |
"trial_params": null
|