Training in progress, step 67000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2b6d95b7e811d1f68b64bc7cb8a6aa2be60af9ae27cf26bbdeedecc87fc96939
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4764d7e2e901d9dd421188980b44c73e20159a2b530b5e58e042540dbd4ca383
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f2f0aa502d64898ee3e50486c039d0e2439e7552237090a80d559862b18540a7
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c8f163bf0d684bb1f1d6d058d310158a309f623a594242fc874446ccea1105f8
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -11756,11 +11756,189 @@
|
|
| 11756 |
"eval_steps_per_second": 23.459,
|
| 11757 |
"num_input_tokens_seen": 17301504000,
|
| 11758 |
"step": 66000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11759 |
}
|
| 11760 |
],
|
| 11761 |
"logging_steps": 50,
|
| 11762 |
"max_steps": 70000,
|
| 11763 |
-
"num_input_tokens_seen":
|
| 11764 |
"num_train_epochs": 1,
|
| 11765 |
"save_steps": 1000,
|
| 11766 |
"stateful_callbacks": {
|
|
@@ -11775,7 +11953,7 @@
|
|
| 11775 |
"attributes": {}
|
| 11776 |
}
|
| 11777 |
},
|
| 11778 |
-
"total_flos": 4.
|
| 11779 |
"train_batch_size": 64,
|
| 11780 |
"trial_name": null,
|
| 11781 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.4506754245682008,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 67000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 11756 |
"eval_steps_per_second": 23.459,
|
| 11757 |
"num_input_tokens_seen": 17301504000,
|
| 11758 |
"step": 66000
|
| 11759 |
+
},
|
| 11760 |
+
{
|
| 11761 |
+
"epoch": 0.4442852506377562,
|
| 11762 |
+
"grad_norm": 0.14435406029224396,
|
| 11763 |
+
"learning_rate": 0.0001429927009743659,
|
| 11764 |
+
"loss": 2.9718,
|
| 11765 |
+
"num_input_tokens_seen": 17314611200,
|
| 11766 |
+
"step": 66050
|
| 11767 |
+
},
|
| 11768 |
+
{
|
| 11769 |
+
"epoch": 0.4446215755814638,
|
| 11770 |
+
"grad_norm": 0.1603071242570877,
|
| 11771 |
+
"learning_rate": 0.0001395732016485406,
|
| 11772 |
+
"loss": 2.9731,
|
| 11773 |
+
"num_input_tokens_seen": 17327718400,
|
| 11774 |
+
"step": 66100
|
| 11775 |
+
},
|
| 11776 |
+
{
|
| 11777 |
+
"epoch": 0.4449579005251714,
|
| 11778 |
+
"grad_norm": 0.14310726523399353,
|
| 11779 |
+
"learning_rate": 0.00013618844100771256,
|
| 11780 |
+
"loss": 2.9665,
|
| 11781 |
+
"num_input_tokens_seen": 17340825600,
|
| 11782 |
+
"step": 66150
|
| 11783 |
+
},
|
| 11784 |
+
{
|
| 11785 |
+
"epoch": 0.44529422546887903,
|
| 11786 |
+
"grad_norm": 0.276594340801239,
|
| 11787 |
+
"learning_rate": 0.00013283874528215734,
|
| 11788 |
+
"loss": 2.9711,
|
| 11789 |
+
"num_input_tokens_seen": 17353932800,
|
| 11790 |
+
"step": 66200
|
| 11791 |
+
},
|
| 11792 |
+
{
|
| 11793 |
+
"epoch": 0.44563055041258665,
|
| 11794 |
+
"grad_norm": 0.1535540074110031,
|
| 11795 |
+
"learning_rate": 0.00012952443732252057,
|
| 11796 |
+
"loss": 2.9693,
|
| 11797 |
+
"num_input_tokens_seen": 17367040000,
|
| 11798 |
+
"step": 66250
|
| 11799 |
+
},
|
| 11800 |
+
{
|
| 11801 |
+
"epoch": 0.44596687535629426,
|
| 11802 |
+
"grad_norm": 0.15807458758354187,
|
| 11803 |
+
"learning_rate": 0.00012624583656870153,
|
| 11804 |
+
"loss": 2.9754,
|
| 11805 |
+
"num_input_tokens_seen": 17380147200,
|
| 11806 |
+
"step": 66300
|
| 11807 |
+
},
|
| 11808 |
+
{
|
| 11809 |
+
"epoch": 0.44630320030000187,
|
| 11810 |
+
"grad_norm": 0.14477893710136414,
|
| 11811 |
+
"learning_rate": 0.00012300325901906528,
|
| 11812 |
+
"loss": 2.9735,
|
| 11813 |
+
"num_input_tokens_seen": 17393254400,
|
| 11814 |
+
"step": 66350
|
| 11815 |
+
},
|
| 11816 |
+
{
|
| 11817 |
+
"epoch": 0.4466395252437095,
|
| 11818 |
+
"grad_norm": 0.14505073428153992,
|
| 11819 |
+
"learning_rate": 0.00011979701719998454,
|
| 11820 |
+
"loss": 2.9783,
|
| 11821 |
+
"num_input_tokens_seen": 17406361600,
|
| 11822 |
+
"step": 66400
|
| 11823 |
+
},
|
| 11824 |
+
{
|
| 11825 |
+
"epoch": 0.4469758501874171,
|
| 11826 |
+
"grad_norm": 0.15850161015987396,
|
| 11827 |
+
"learning_rate": 0.00011662742013571926,
|
| 11828 |
+
"loss": 2.967,
|
| 11829 |
+
"num_input_tokens_seen": 17419468800,
|
| 11830 |
+
"step": 66450
|
| 11831 |
+
},
|
| 11832 |
+
{
|
| 11833 |
+
"epoch": 0.4473121751311247,
|
| 11834 |
+
"grad_norm": 0.14653578400611877,
|
| 11835 |
+
"learning_rate": 0.00011349477331863151,
|
| 11836 |
+
"loss": 2.9651,
|
| 11837 |
+
"num_input_tokens_seen": 17432576000,
|
| 11838 |
+
"step": 66500
|
| 11839 |
+
},
|
| 11840 |
+
{
|
| 11841 |
+
"epoch": 0.4473121751311247,
|
| 11842 |
+
"eval_loss": 2.8710148334503174,
|
| 11843 |
+
"eval_runtime": 53.2889,
|
| 11844 |
+
"eval_samples_per_second": 93.828,
|
| 11845 |
+
"eval_steps_per_second": 23.457,
|
| 11846 |
+
"num_input_tokens_seen": 17432576000,
|
| 11847 |
+
"step": 66500
|
| 11848 |
+
},
|
| 11849 |
+
{
|
| 11850 |
+
"epoch": 0.4476485000748323,
|
| 11851 |
+
"grad_norm": 0.15636616945266724,
|
| 11852 |
+
"learning_rate": 0.00011039937867974164,
|
| 11853 |
+
"loss": 2.9758,
|
| 11854 |
+
"num_input_tokens_seen": 17445683200,
|
| 11855 |
+
"step": 66550
|
| 11856 |
+
},
|
| 11857 |
+
{
|
| 11858 |
+
"epoch": 0.4479848250185399,
|
| 11859 |
+
"grad_norm": 0.14427579939365387,
|
| 11860 |
+
"learning_rate": 0.00010734153455962764,
|
| 11861 |
+
"loss": 2.9594,
|
| 11862 |
+
"num_input_tokens_seen": 17458790400,
|
| 11863 |
+
"step": 66600
|
| 11864 |
+
},
|
| 11865 |
+
{
|
| 11866 |
+
"epoch": 0.44832114996224753,
|
| 11867 |
+
"grad_norm": 0.15148353576660156,
|
| 11868 |
+
"learning_rate": 0.00010432153567966984,
|
| 11869 |
+
"loss": 2.9684,
|
| 11870 |
+
"num_input_tokens_seen": 17471897600,
|
| 11871 |
+
"step": 66650
|
| 11872 |
+
},
|
| 11873 |
+
{
|
| 11874 |
+
"epoch": 0.44865747490595514,
|
| 11875 |
+
"grad_norm": 0.1541094332933426,
|
| 11876 |
+
"learning_rate": 0.0001013396731136465,
|
| 11877 |
+
"loss": 2.9685,
|
| 11878 |
+
"num_input_tokens_seen": 17485004800,
|
| 11879 |
+
"step": 66700
|
| 11880 |
+
},
|
| 11881 |
+
{
|
| 11882 |
+
"epoch": 0.44899379984966276,
|
| 11883 |
+
"grad_norm": 0.14267295598983765,
|
| 11884 |
+
"learning_rate": 9.839623425967759e-05,
|
| 11885 |
+
"loss": 2.9728,
|
| 11886 |
+
"num_input_tokens_seen": 17498112000,
|
| 11887 |
+
"step": 66750
|
| 11888 |
+
},
|
| 11889 |
+
{
|
| 11890 |
+
"epoch": 0.44933012479337037,
|
| 11891 |
+
"grad_norm": 0.1437918245792389,
|
| 11892 |
+
"learning_rate": 9.549150281252633e-05,
|
| 11893 |
+
"loss": 2.9752,
|
| 11894 |
+
"num_input_tokens_seen": 17511219200,
|
| 11895 |
+
"step": 66800
|
| 11896 |
+
},
|
| 11897 |
+
{
|
| 11898 |
+
"epoch": 0.449666449737078,
|
| 11899 |
+
"grad_norm": 0.1517232209444046,
|
| 11900 |
+
"learning_rate": 9.262575873625529e-05,
|
| 11901 |
+
"loss": 2.9729,
|
| 11902 |
+
"num_input_tokens_seen": 17524326400,
|
| 11903 |
+
"step": 66850
|
| 11904 |
+
},
|
| 11905 |
+
{
|
| 11906 |
+
"epoch": 0.4500027746807856,
|
| 11907 |
+
"grad_norm": 0.15286608040332794,
|
| 11908 |
+
"learning_rate": 8.979927823724321e-05,
|
| 11909 |
+
"loss": 2.9687,
|
| 11910 |
+
"num_input_tokens_seen": 17537433600,
|
| 11911 |
+
"step": 66900
|
| 11912 |
+
},
|
| 11913 |
+
{
|
| 11914 |
+
"epoch": 0.4503390996244932,
|
| 11915 |
+
"grad_norm": 0.14875057339668274,
|
| 11916 |
+
"learning_rate": 8.70123337375635e-05,
|
| 11917 |
+
"loss": 2.9758,
|
| 11918 |
+
"num_input_tokens_seen": 17550540800,
|
| 11919 |
+
"step": 66950
|
| 11920 |
+
},
|
| 11921 |
+
{
|
| 11922 |
+
"epoch": 0.4506754245682008,
|
| 11923 |
+
"grad_norm": 0.1493612825870514,
|
| 11924 |
+
"learning_rate": 8.426519384872733e-05,
|
| 11925 |
+
"loss": 2.9704,
|
| 11926 |
+
"num_input_tokens_seen": 17563648000,
|
| 11927 |
+
"step": 67000
|
| 11928 |
+
},
|
| 11929 |
+
{
|
| 11930 |
+
"epoch": 0.4506754245682008,
|
| 11931 |
+
"eval_loss": 2.869231939315796,
|
| 11932 |
+
"eval_runtime": 53.2491,
|
| 11933 |
+
"eval_samples_per_second": 93.898,
|
| 11934 |
+
"eval_steps_per_second": 23.475,
|
| 11935 |
+
"num_input_tokens_seen": 17563648000,
|
| 11936 |
+
"step": 67000
|
| 11937 |
}
|
| 11938 |
],
|
| 11939 |
"logging_steps": 50,
|
| 11940 |
"max_steps": 70000,
|
| 11941 |
+
"num_input_tokens_seen": 17563648000,
|
| 11942 |
"num_train_epochs": 1,
|
| 11943 |
"save_steps": 1000,
|
| 11944 |
"stateful_callbacks": {
|
|
|
|
| 11953 |
"attributes": {}
|
| 11954 |
}
|
| 11955 |
},
|
| 11956 |
+
"total_flos": 4.69844726120448e+18,
|
| 11957 |
"train_batch_size": 64,
|
| 11958 |
"trial_name": null,
|
| 11959 |
"trial_params": null
|