Training in progress, step 67000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4df562358f0b3d93fdb48e67f5210b057adeffd8b788222cd6d30c1e17d16a45
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:095f32100e867e0fe913cd1c8e425177cd1f66e07c341665a191649c37a86bd3
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f2f0aa502d64898ee3e50486c039d0e2439e7552237090a80d559862b18540a7
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:00a7e117096eaa1f05b475c020696dc81b37bf94c840c6a7b407a88337130d26
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -11756,11 +11756,189 @@
|
|
| 11756 |
"eval_steps_per_second": 23.453,
|
| 11757 |
"num_input_tokens_seen": 17301499456,
|
| 11758 |
"step": 66000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11759 |
}
|
| 11760 |
],
|
| 11761 |
"logging_steps": 50,
|
| 11762 |
"max_steps": 70000,
|
| 11763 |
-
"num_input_tokens_seen":
|
| 11764 |
"num_train_epochs": 1,
|
| 11765 |
"save_steps": 1000,
|
| 11766 |
"stateful_callbacks": {
|
|
@@ -11775,7 +11953,7 @@
|
|
| 11775 |
"attributes": {}
|
| 11776 |
}
|
| 11777 |
},
|
| 11778 |
-
"total_flos": 4.
|
| 11779 |
"train_batch_size": 64,
|
| 11780 |
"trial_name": null,
|
| 11781 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.31959168584614284,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 67000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 11756 |
"eval_steps_per_second": 23.453,
|
| 11757 |
"num_input_tokens_seen": 17301499456,
|
| 11758 |
"step": 66000
|
| 11759 |
+
},
|
| 11760 |
+
{
|
| 11761 |
+
"epoch": 0.31506016194235426,
|
| 11762 |
+
"grad_norm": 0.16586218774318695,
|
| 11763 |
+
"learning_rate": 0.00018388874897104518,
|
| 11764 |
+
"loss": 2.5468,
|
| 11765 |
+
"num_input_tokens_seen": 17314606656,
|
| 11766 |
+
"step": 66050
|
| 11767 |
+
},
|
| 11768 |
+
{
|
| 11769 |
+
"epoch": 0.31529866320044836,
|
| 11770 |
+
"grad_norm": 0.1646813303232193,
|
| 11771 |
+
"learning_rate": 0.00017956219300748795,
|
| 11772 |
+
"loss": 2.5352,
|
| 11773 |
+
"num_input_tokens_seen": 17327713856,
|
| 11774 |
+
"step": 66100
|
| 11775 |
+
},
|
| 11776 |
+
{
|
| 11777 |
+
"epoch": 0.3155371644585425,
|
| 11778 |
+
"grad_norm": 0.18712937831878662,
|
| 11779 |
+
"learning_rate": 0.00017527597583490823,
|
| 11780 |
+
"loss": 2.5412,
|
| 11781 |
+
"num_input_tokens_seen": 17340821056,
|
| 11782 |
+
"step": 66150
|
| 11783 |
+
},
|
| 11784 |
+
{
|
| 11785 |
+
"epoch": 0.3157756657166367,
|
| 11786 |
+
"grad_norm": 0.1631355583667755,
|
| 11787 |
+
"learning_rate": 0.00017103063703014372,
|
| 11788 |
+
"loss": 2.5272,
|
| 11789 |
+
"num_input_tokens_seen": 17353928256,
|
| 11790 |
+
"step": 66200
|
| 11791 |
+
},
|
| 11792 |
+
{
|
| 11793 |
+
"epoch": 0.3160141669747308,
|
| 11794 |
+
"grad_norm": 0.15910203754901886,
|
| 11795 |
+
"learning_rate": 0.00016682671102399805,
|
| 11796 |
+
"loss": 2.5333,
|
| 11797 |
+
"num_input_tokens_seen": 17367035456,
|
| 11798 |
+
"step": 66250
|
| 11799 |
+
},
|
| 11800 |
+
{
|
| 11801 |
+
"epoch": 0.31625266823282494,
|
| 11802 |
+
"grad_norm": 0.5742849707603455,
|
| 11803 |
+
"learning_rate": 0.00016266472703396284,
|
| 11804 |
+
"loss": 2.5463,
|
| 11805 |
+
"num_input_tokens_seen": 17380142656,
|
| 11806 |
+
"step": 66300
|
| 11807 |
+
},
|
| 11808 |
+
{
|
| 11809 |
+
"epoch": 0.31649116949091904,
|
| 11810 |
+
"grad_norm": 0.17517830431461334,
|
| 11811 |
+
"learning_rate": 0.00015854520899759655,
|
| 11812 |
+
"loss": 2.5511,
|
| 11813 |
+
"num_input_tokens_seen": 17393249856,
|
| 11814 |
+
"step": 66350
|
| 11815 |
+
},
|
| 11816 |
+
{
|
| 11817 |
+
"epoch": 0.3167296707490132,
|
| 11818 |
+
"grad_norm": 0.6962131857872009,
|
| 11819 |
+
"learning_rate": 0.00015446867550656767,
|
| 11820 |
+
"loss": 2.5452,
|
| 11821 |
+
"num_input_tokens_seen": 17406357056,
|
| 11822 |
+
"step": 66400
|
| 11823 |
+
},
|
| 11824 |
+
{
|
| 11825 |
+
"epoch": 0.31696817200710736,
|
| 11826 |
+
"grad_norm": 0.16677837073802948,
|
| 11827 |
+
"learning_rate": 0.00015043563974137132,
|
| 11828 |
+
"loss": 2.5392,
|
| 11829 |
+
"num_input_tokens_seen": 17419464256,
|
| 11830 |
+
"step": 66450
|
| 11831 |
+
},
|
| 11832 |
+
{
|
| 11833 |
+
"epoch": 0.31720667326520147,
|
| 11834 |
+
"grad_norm": 0.16235870122909546,
|
| 11835 |
+
"learning_rate": 0.00014644660940672628,
|
| 11836 |
+
"loss": 2.5125,
|
| 11837 |
+
"num_input_tokens_seen": 17432571456,
|
| 11838 |
+
"step": 66500
|
| 11839 |
+
},
|
| 11840 |
+
{
|
| 11841 |
+
"epoch": 0.31720667326520147,
|
| 11842 |
+
"eval_loss": 2.419802188873291,
|
| 11843 |
+
"eval_runtime": 52.8641,
|
| 11844 |
+
"eval_samples_per_second": 94.582,
|
| 11845 |
+
"eval_steps_per_second": 23.646,
|
| 11846 |
+
"num_input_tokens_seen": 17432571456,
|
| 11847 |
+
"step": 66500
|
| 11848 |
+
},
|
| 11849 |
+
{
|
| 11850 |
+
"epoch": 0.3174451745232956,
|
| 11851 |
+
"grad_norm": 0.17308832705020905,
|
| 11852 |
+
"learning_rate": 0.00014250208666766236,
|
| 11853 |
+
"loss": 2.5349,
|
| 11854 |
+
"num_input_tokens_seen": 17445678656,
|
| 11855 |
+
"step": 66550
|
| 11856 |
+
},
|
| 11857 |
+
{
|
| 11858 |
+
"epoch": 0.31768367578138973,
|
| 11859 |
+
"grad_norm": 0.16299477219581604,
|
| 11860 |
+
"learning_rate": 0.00013860256808630427,
|
| 11861 |
+
"loss": 2.5277,
|
| 11862 |
+
"num_input_tokens_seen": 17458785856,
|
| 11863 |
+
"step": 66600
|
| 11864 |
+
},
|
| 11865 |
+
{
|
| 11866 |
+
"epoch": 0.3179221770394839,
|
| 11867 |
+
"grad_norm": 0.18277022242546082,
|
| 11868 |
+
"learning_rate": 0.00013474854455936125,
|
| 11869 |
+
"loss": 2.5203,
|
| 11870 |
+
"num_input_tokens_seen": 17471893056,
|
| 11871 |
+
"step": 66650
|
| 11872 |
+
},
|
| 11873 |
+
{
|
| 11874 |
+
"epoch": 0.318160678297578,
|
| 11875 |
+
"grad_norm": 0.16096614301204681,
|
| 11876 |
+
"learning_rate": 0.00013094050125632973,
|
| 11877 |
+
"loss": 2.535,
|
| 11878 |
+
"num_input_tokens_seen": 17485000256,
|
| 11879 |
+
"step": 66700
|
| 11880 |
+
},
|
| 11881 |
+
{
|
| 11882 |
+
"epoch": 0.31839917955567215,
|
| 11883 |
+
"grad_norm": 0.1723272204399109,
|
| 11884 |
+
"learning_rate": 0.0001271789175584172,
|
| 11885 |
+
"loss": 2.549,
|
| 11886 |
+
"num_input_tokens_seen": 17498107456,
|
| 11887 |
+
"step": 66750
|
| 11888 |
+
},
|
| 11889 |
+
{
|
| 11890 |
+
"epoch": 0.3186376808137663,
|
| 11891 |
+
"grad_norm": 0.15782694518566132,
|
| 11892 |
+
"learning_rate": 0.00012346426699819457,
|
| 11893 |
+
"loss": 2.5317,
|
| 11894 |
+
"num_input_tokens_seen": 17511214656,
|
| 11895 |
+
"step": 66800
|
| 11896 |
+
},
|
| 11897 |
+
{
|
| 11898 |
+
"epoch": 0.3188761820718604,
|
| 11899 |
+
"grad_norm": 0.1627569943666458,
|
| 11900 |
+
"learning_rate": 0.00011979701719998454,
|
| 11901 |
+
"loss": 2.5382,
|
| 11902 |
+
"num_input_tokens_seen": 17524321856,
|
| 11903 |
+
"step": 66850
|
| 11904 |
+
},
|
| 11905 |
+
{
|
| 11906 |
+
"epoch": 0.3191146833299546,
|
| 11907 |
+
"grad_norm": 0.16340333223342896,
|
| 11908 |
+
"learning_rate": 0.00011617762982099444,
|
| 11909 |
+
"loss": 2.5477,
|
| 11910 |
+
"num_input_tokens_seen": 17537429056,
|
| 11911 |
+
"step": 66900
|
| 11912 |
+
},
|
| 11913 |
+
{
|
| 11914 |
+
"epoch": 0.3193531845880487,
|
| 11915 |
+
"grad_norm": 0.15788671374320984,
|
| 11916 |
+
"learning_rate": 0.00011260656049319957,
|
| 11917 |
+
"loss": 2.537,
|
| 11918 |
+
"num_input_tokens_seen": 17550536256,
|
| 11919 |
+
"step": 66950
|
| 11920 |
+
},
|
| 11921 |
+
{
|
| 11922 |
+
"epoch": 0.31959168584614284,
|
| 11923 |
+
"grad_norm": 0.16191193461418152,
|
| 11924 |
+
"learning_rate": 0.0001090842587659851,
|
| 11925 |
+
"loss": 2.5394,
|
| 11926 |
+
"num_input_tokens_seen": 17563643456,
|
| 11927 |
+
"step": 67000
|
| 11928 |
+
},
|
| 11929 |
+
{
|
| 11930 |
+
"epoch": 0.31959168584614284,
|
| 11931 |
+
"eval_loss": 2.417813301086426,
|
| 11932 |
+
"eval_runtime": 53.532,
|
| 11933 |
+
"eval_samples_per_second": 93.402,
|
| 11934 |
+
"eval_steps_per_second": 23.351,
|
| 11935 |
+
"num_input_tokens_seen": 17563643456,
|
| 11936 |
+
"step": 67000
|
| 11937 |
}
|
| 11938 |
],
|
| 11939 |
"logging_steps": 50,
|
| 11940 |
"max_steps": 70000,
|
| 11941 |
+
"num_input_tokens_seen": 17563643456,
|
| 11942 |
"num_train_epochs": 1,
|
| 11943 |
"save_steps": 1000,
|
| 11944 |
"stateful_callbacks": {
|
|
|
|
| 11953 |
"attributes": {}
|
| 11954 |
}
|
| 11955 |
},
|
| 11956 |
+
"total_flos": 4.698446045640131e+18,
|
| 11957 |
"train_batch_size": 64,
|
| 11958 |
"trial_name": null,
|
| 11959 |
"trial_params": null
|