Training in progress, step 56000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:75a54732bc39e58afccb21a46f57190dd49c2ae00c7fd73b4d8434827934d2aa
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:da1643d7c66b6de7210d626427e81524686db0e0650499f03aeaee61e640ca95
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0f1d7953b9adf97d81c8d5df7c90f2cd3786e196584c751d3c25ee459604bb2b
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:56dc1edb3d2e4264095d54347eab2555bc17fb9d10875074bfbbaaa6e5eeeb69
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -9798,11 +9798,189 @@
|
|
| 9798 |
"eval_steps_per_second": 23.444,
|
| 9799 |
"num_input_tokens_seen": 14417915456,
|
| 9800 |
"step": 55000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9801 |
}
|
| 9802 |
],
|
| 9803 |
"logging_steps": 50,
|
| 9804 |
"max_steps": 70000,
|
| 9805 |
-
"num_input_tokens_seen":
|
| 9806 |
"num_train_epochs": 1,
|
| 9807 |
"save_steps": 1000,
|
| 9808 |
"stateful_callbacks": {
|
|
@@ -9817,7 +9995,7 @@
|
|
| 9817 |
"attributes": {}
|
| 9818 |
}
|
| 9819 |
},
|
| 9820 |
-
"total_flos": 3.
|
| 9821 |
"train_batch_size": 64,
|
| 9822 |
"trial_name": null,
|
| 9823 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.2671214090654328,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 56000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 9798 |
"eval_steps_per_second": 23.444,
|
| 9799 |
"num_input_tokens_seen": 14417915456,
|
| 9800 |
"step": 55000
|
| 9801 |
+
},
|
| 9802 |
+
{
|
| 9803 |
+
"epoch": 0.2625898851616442,
|
| 9804 |
+
"grad_norm": 0.22046101093292236,
|
| 9805 |
+
"learning_rate": 0.001,
|
| 9806 |
+
"loss": 2.6077,
|
| 9807 |
+
"num_input_tokens_seen": 14431022656,
|
| 9808 |
+
"step": 55050
|
| 9809 |
+
},
|
| 9810 |
+
{
|
| 9811 |
+
"epoch": 0.2628283864197384,
|
| 9812 |
+
"grad_norm": 0.4682837724685669,
|
| 9813 |
+
"learning_rate": 0.001,
|
| 9814 |
+
"loss": 2.6065,
|
| 9815 |
+
"num_input_tokens_seen": 14444129856,
|
| 9816 |
+
"step": 55100
|
| 9817 |
+
},
|
| 9818 |
+
{
|
| 9819 |
+
"epoch": 0.2630668876778325,
|
| 9820 |
+
"grad_norm": 0.21442484855651855,
|
| 9821 |
+
"learning_rate": 0.001,
|
| 9822 |
+
"loss": 2.6079,
|
| 9823 |
+
"num_input_tokens_seen": 14457237056,
|
| 9824 |
+
"step": 55150
|
| 9825 |
+
},
|
| 9826 |
+
{
|
| 9827 |
+
"epoch": 0.26330538893592664,
|
| 9828 |
+
"grad_norm": 0.2513403296470642,
|
| 9829 |
+
"learning_rate": 0.001,
|
| 9830 |
+
"loss": 2.6037,
|
| 9831 |
+
"num_input_tokens_seen": 14470344256,
|
| 9832 |
+
"step": 55200
|
| 9833 |
+
},
|
| 9834 |
+
{
|
| 9835 |
+
"epoch": 0.26354389019402075,
|
| 9836 |
+
"grad_norm": 0.21526487171649933,
|
| 9837 |
+
"learning_rate": 0.001,
|
| 9838 |
+
"loss": 2.6049,
|
| 9839 |
+
"num_input_tokens_seen": 14483451456,
|
| 9840 |
+
"step": 55250
|
| 9841 |
+
},
|
| 9842 |
+
{
|
| 9843 |
+
"epoch": 0.2637823914521149,
|
| 9844 |
+
"grad_norm": 0.22567112743854523,
|
| 9845 |
+
"learning_rate": 0.001,
|
| 9846 |
+
"loss": 2.5953,
|
| 9847 |
+
"num_input_tokens_seen": 14496558656,
|
| 9848 |
+
"step": 55300
|
| 9849 |
+
},
|
| 9850 |
+
{
|
| 9851 |
+
"epoch": 0.26402089271020907,
|
| 9852 |
+
"grad_norm": 0.20226064324378967,
|
| 9853 |
+
"learning_rate": 0.001,
|
| 9854 |
+
"loss": 2.609,
|
| 9855 |
+
"num_input_tokens_seen": 14509665856,
|
| 9856 |
+
"step": 55350
|
| 9857 |
+
},
|
| 9858 |
+
{
|
| 9859 |
+
"epoch": 0.26425939396830317,
|
| 9860 |
+
"grad_norm": 0.31736019253730774,
|
| 9861 |
+
"learning_rate": 0.001,
|
| 9862 |
+
"loss": 2.6174,
|
| 9863 |
+
"num_input_tokens_seen": 14522773056,
|
| 9864 |
+
"step": 55400
|
| 9865 |
+
},
|
| 9866 |
+
{
|
| 9867 |
+
"epoch": 0.26449789522639733,
|
| 9868 |
+
"grad_norm": 0.2573414146900177,
|
| 9869 |
+
"learning_rate": 0.001,
|
| 9870 |
+
"loss": 2.612,
|
| 9871 |
+
"num_input_tokens_seen": 14535880256,
|
| 9872 |
+
"step": 55450
|
| 9873 |
+
},
|
| 9874 |
+
{
|
| 9875 |
+
"epoch": 0.26473639648449143,
|
| 9876 |
+
"grad_norm": 0.278160959482193,
|
| 9877 |
+
"learning_rate": 0.001,
|
| 9878 |
+
"loss": 2.6713,
|
| 9879 |
+
"num_input_tokens_seen": 14548987456,
|
| 9880 |
+
"step": 55500
|
| 9881 |
+
},
|
| 9882 |
+
{
|
| 9883 |
+
"epoch": 0.26473639648449143,
|
| 9884 |
+
"eval_loss": 2.5104730129241943,
|
| 9885 |
+
"eval_runtime": 54.2403,
|
| 9886 |
+
"eval_samples_per_second": 92.182,
|
| 9887 |
+
"eval_steps_per_second": 23.046,
|
| 9888 |
+
"num_input_tokens_seen": 14548987456,
|
| 9889 |
+
"step": 55500
|
| 9890 |
+
},
|
| 9891 |
+
{
|
| 9892 |
+
"epoch": 0.2649748977425856,
|
| 9893 |
+
"grad_norm": 0.25843819975852966,
|
| 9894 |
+
"learning_rate": 0.001,
|
| 9895 |
+
"loss": 2.6223,
|
| 9896 |
+
"num_input_tokens_seen": 14562094656,
|
| 9897 |
+
"step": 55550
|
| 9898 |
+
},
|
| 9899 |
+
{
|
| 9900 |
+
"epoch": 0.26521339900067975,
|
| 9901 |
+
"grad_norm": 0.42813193798065186,
|
| 9902 |
+
"learning_rate": 0.001,
|
| 9903 |
+
"loss": 2.6114,
|
| 9904 |
+
"num_input_tokens_seen": 14575201856,
|
| 9905 |
+
"step": 55600
|
| 9906 |
+
},
|
| 9907 |
+
{
|
| 9908 |
+
"epoch": 0.26545190025877385,
|
| 9909 |
+
"grad_norm": 0.23324181139469147,
|
| 9910 |
+
"learning_rate": 0.001,
|
| 9911 |
+
"loss": 2.6149,
|
| 9912 |
+
"num_input_tokens_seen": 14588309056,
|
| 9913 |
+
"step": 55650
|
| 9914 |
+
},
|
| 9915 |
+
{
|
| 9916 |
+
"epoch": 0.265690401516868,
|
| 9917 |
+
"grad_norm": 0.2795487940311432,
|
| 9918 |
+
"learning_rate": 0.001,
|
| 9919 |
+
"loss": 2.6067,
|
| 9920 |
+
"num_input_tokens_seen": 14601416256,
|
| 9921 |
+
"step": 55700
|
| 9922 |
+
},
|
| 9923 |
+
{
|
| 9924 |
+
"epoch": 0.2659289027749621,
|
| 9925 |
+
"grad_norm": 0.6856834888458252,
|
| 9926 |
+
"learning_rate": 0.001,
|
| 9927 |
+
"loss": 2.6135,
|
| 9928 |
+
"num_input_tokens_seen": 14614523456,
|
| 9929 |
+
"step": 55750
|
| 9930 |
+
},
|
| 9931 |
+
{
|
| 9932 |
+
"epoch": 0.2661674040330563,
|
| 9933 |
+
"grad_norm": 0.348906934261322,
|
| 9934 |
+
"learning_rate": 0.001,
|
| 9935 |
+
"loss": 2.6384,
|
| 9936 |
+
"num_input_tokens_seen": 14627630656,
|
| 9937 |
+
"step": 55800
|
| 9938 |
+
},
|
| 9939 |
+
{
|
| 9940 |
+
"epoch": 0.26640590529115044,
|
| 9941 |
+
"grad_norm": 0.2510247528553009,
|
| 9942 |
+
"learning_rate": 0.001,
|
| 9943 |
+
"loss": 2.6224,
|
| 9944 |
+
"num_input_tokens_seen": 14640737856,
|
| 9945 |
+
"step": 55850
|
| 9946 |
+
},
|
| 9947 |
+
{
|
| 9948 |
+
"epoch": 0.26664440654924454,
|
| 9949 |
+
"grad_norm": 0.34429189562797546,
|
| 9950 |
+
"learning_rate": 0.001,
|
| 9951 |
+
"loss": 2.6139,
|
| 9952 |
+
"num_input_tokens_seen": 14653845056,
|
| 9953 |
+
"step": 55900
|
| 9954 |
+
},
|
| 9955 |
+
{
|
| 9956 |
+
"epoch": 0.2668829078073387,
|
| 9957 |
+
"grad_norm": 0.25697243213653564,
|
| 9958 |
+
"learning_rate": 0.001,
|
| 9959 |
+
"loss": 2.6143,
|
| 9960 |
+
"num_input_tokens_seen": 14666952256,
|
| 9961 |
+
"step": 55950
|
| 9962 |
+
},
|
| 9963 |
+
{
|
| 9964 |
+
"epoch": 0.2671214090654328,
|
| 9965 |
+
"grad_norm": 0.2812611758708954,
|
| 9966 |
+
"learning_rate": 0.001,
|
| 9967 |
+
"loss": 2.6172,
|
| 9968 |
+
"num_input_tokens_seen": 14680059456,
|
| 9969 |
+
"step": 56000
|
| 9970 |
+
},
|
| 9971 |
+
{
|
| 9972 |
+
"epoch": 0.2671214090654328,
|
| 9973 |
+
"eval_loss": 2.492490291595459,
|
| 9974 |
+
"eval_runtime": 53.3814,
|
| 9975 |
+
"eval_samples_per_second": 93.666,
|
| 9976 |
+
"eval_steps_per_second": 23.416,
|
| 9977 |
+
"num_input_tokens_seen": 14680059456,
|
| 9978 |
+
"step": 56000
|
| 9979 |
}
|
| 9980 |
],
|
| 9981 |
"logging_steps": 50,
|
| 9982 |
"max_steps": 70000,
|
| 9983 |
+
"num_input_tokens_seen": 14680059456,
|
| 9984 |
"num_train_epochs": 1,
|
| 9985 |
"save_steps": 1000,
|
| 9986 |
"stateful_callbacks": {
|
|
|
|
| 9995 |
"attributes": {}
|
| 9996 |
}
|
| 9997 |
},
|
| 9998 |
+
"total_flos": 3.9270591818602906e+18,
|
| 9999 |
"train_batch_size": 64,
|
| 10000 |
"trial_name": null,
|
| 10001 |
"trial_params": null
|