Training in progress, step 21000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 373077376
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ba9d75a78fad20f4b1e389f6c85dda0f453be86d800ed2eba32953160cc02033
|
| 3 |
size 373077376
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 209816139
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:df2f641838670afd6d1bb0181e8efde74cebba7ddaeaad933397844d1eb9afb6
|
| 3 |
size 209816139
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14917
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eda74d083cd5d9b07d403914b5a235c44dd87bc93a29636e940f36b95f8743f9
|
| 3 |
size 14917
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14917
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:91f1feed6ec98326449107f6ac06aad035f8176b90aa697c6edf6a509039a50c
|
| 3 |
size 14917
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1401
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c6645e7dc37725bbae83eaf70fb81001a75be54d9a6554f43743dfb20cfc0984
|
| 3 |
size 1401
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": 2000,
|
| 3 |
"best_metric": 9.218317031860352,
|
| 4 |
"best_model_checkpoint": "./artifacts/models/base-250725-test/checkpoint-2000",
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 1000,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -5768,6 +5768,294 @@
|
|
| 5768 |
"eval_samples_per_second": 50.82,
|
| 5769 |
"eval_steps_per_second": 3.184,
|
| 5770 |
"step": 20000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5771 |
}
|
| 5772 |
],
|
| 5773 |
"logging_steps": 25,
|
|
@@ -5787,7 +6075,7 @@
|
|
| 5787 |
"attributes": {}
|
| 5788 |
}
|
| 5789 |
},
|
| 5790 |
-
"total_flos": 2.
|
| 5791 |
"train_batch_size": 8,
|
| 5792 |
"trial_name": null,
|
| 5793 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": 2000,
|
| 3 |
"best_metric": 9.218317031860352,
|
| 4 |
"best_model_checkpoint": "./artifacts/models/base-250725-test/checkpoint-2000",
|
| 5 |
+
"epoch": 0.06558523894888724,
|
| 6 |
"eval_steps": 1000,
|
| 7 |
+
"global_step": 21000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 5768 |
"eval_samples_per_second": 50.82,
|
| 5769 |
"eval_steps_per_second": 3.184,
|
| 5770 |
"step": 20000
|
| 5771 |
+
},
|
| 5772 |
+
{
|
| 5773 |
+
"epoch": 0.0625402099976889,
|
| 5774 |
+
"grad_norm": 50.0,
|
| 5775 |
+
"learning_rate": 0.000999570404664504,
|
| 5776 |
+
"loss": 34.3706,
|
| 5777 |
+
"step": 20025
|
| 5778 |
+
},
|
| 5779 |
+
{
|
| 5780 |
+
"epoch": 0.06261828766310425,
|
| 5781 |
+
"grad_norm": 45.75,
|
| 5782 |
+
"learning_rate": 0.0009995650375662492,
|
| 5783 |
+
"loss": 34.1775,
|
| 5784 |
+
"step": 20050
|
| 5785 |
+
},
|
| 5786 |
+
{
|
| 5787 |
+
"epoch": 0.06269636532851959,
|
| 5788 |
+
"grad_norm": 43.5,
|
| 5789 |
+
"learning_rate": 0.0009995596371637897,
|
| 5790 |
+
"loss": 34.3327,
|
| 5791 |
+
"step": 20075
|
| 5792 |
+
},
|
| 5793 |
+
{
|
| 5794 |
+
"epoch": 0.06277444299393492,
|
| 5795 |
+
"grad_norm": 43.25,
|
| 5796 |
+
"learning_rate": 0.0009995542034574863,
|
| 5797 |
+
"loss": 34.3871,
|
| 5798 |
+
"step": 20100
|
| 5799 |
+
},
|
| 5800 |
+
{
|
| 5801 |
+
"epoch": 0.06285252065935026,
|
| 5802 |
+
"grad_norm": 42.75,
|
| 5803 |
+
"learning_rate": 0.0009995487364477004,
|
| 5804 |
+
"loss": 33.8116,
|
| 5805 |
+
"step": 20125
|
| 5806 |
+
},
|
| 5807 |
+
{
|
| 5808 |
+
"epoch": 0.06293059832476561,
|
| 5809 |
+
"grad_norm": 37.5,
|
| 5810 |
+
"learning_rate": 0.0009995432361347971,
|
| 5811 |
+
"loss": 33.9015,
|
| 5812 |
+
"step": 20150
|
| 5813 |
+
},
|
| 5814 |
+
{
|
| 5815 |
+
"epoch": 0.06300867599018095,
|
| 5816 |
+
"grad_norm": 38.5,
|
| 5817 |
+
"learning_rate": 0.0009995377025191427,
|
| 5818 |
+
"loss": 33.8639,
|
| 5819 |
+
"step": 20175
|
| 5820 |
+
},
|
| 5821 |
+
{
|
| 5822 |
+
"epoch": 0.0630867536555963,
|
| 5823 |
+
"grad_norm": 37.25,
|
| 5824 |
+
"learning_rate": 0.0009995321356011063,
|
| 5825 |
+
"loss": 33.6663,
|
| 5826 |
+
"step": 20200
|
| 5827 |
+
},
|
| 5828 |
+
{
|
| 5829 |
+
"epoch": 0.06316483132101164,
|
| 5830 |
+
"grad_norm": 40.5,
|
| 5831 |
+
"learning_rate": 0.0009995265353810589,
|
| 5832 |
+
"loss": 33.8264,
|
| 5833 |
+
"step": 20225
|
| 5834 |
+
},
|
| 5835 |
+
{
|
| 5836 |
+
"epoch": 0.06324290898642698,
|
| 5837 |
+
"grad_norm": 45.25,
|
| 5838 |
+
"learning_rate": 0.0009995209018593737,
|
| 5839 |
+
"loss": 33.6851,
|
| 5840 |
+
"step": 20250
|
| 5841 |
+
},
|
| 5842 |
+
{
|
| 5843 |
+
"epoch": 0.06332098665184233,
|
| 5844 |
+
"grad_norm": 42.0,
|
| 5845 |
+
"learning_rate": 0.0009995152350364266,
|
| 5846 |
+
"loss": 33.5799,
|
| 5847 |
+
"step": 20275
|
| 5848 |
+
},
|
| 5849 |
+
{
|
| 5850 |
+
"epoch": 0.06339906431725766,
|
| 5851 |
+
"grad_norm": 43.25,
|
| 5852 |
+
"learning_rate": 0.000999509534912595,
|
| 5853 |
+
"loss": 33.6905,
|
| 5854 |
+
"step": 20300
|
| 5855 |
+
},
|
| 5856 |
+
{
|
| 5857 |
+
"epoch": 0.063477141982673,
|
| 5858 |
+
"grad_norm": 37.25,
|
| 5859 |
+
"learning_rate": 0.0009995038014882593,
|
| 5860 |
+
"loss": 33.4839,
|
| 5861 |
+
"step": 20325
|
| 5862 |
+
},
|
| 5863 |
+
{
|
| 5864 |
+
"epoch": 0.06355521964808834,
|
| 5865 |
+
"grad_norm": 35.75,
|
| 5866 |
+
"learning_rate": 0.0009994980347638016,
|
| 5867 |
+
"loss": 33.6105,
|
| 5868 |
+
"step": 20350
|
| 5869 |
+
},
|
| 5870 |
+
{
|
| 5871 |
+
"epoch": 0.06363329731350369,
|
| 5872 |
+
"grad_norm": 38.0,
|
| 5873 |
+
"learning_rate": 0.0009994922347396063,
|
| 5874 |
+
"loss": 33.9047,
|
| 5875 |
+
"step": 20375
|
| 5876 |
+
},
|
| 5877 |
+
{
|
| 5878 |
+
"epoch": 0.06371137497891903,
|
| 5879 |
+
"grad_norm": 40.25,
|
| 5880 |
+
"learning_rate": 0.00099948640141606,
|
| 5881 |
+
"loss": 34.1876,
|
| 5882 |
+
"step": 20400
|
| 5883 |
+
},
|
| 5884 |
+
{
|
| 5885 |
+
"epoch": 0.06378945264433437,
|
| 5886 |
+
"grad_norm": 45.75,
|
| 5887 |
+
"learning_rate": 0.0009994805347935517,
|
| 5888 |
+
"loss": 33.9303,
|
| 5889 |
+
"step": 20425
|
| 5890 |
+
},
|
| 5891 |
+
{
|
| 5892 |
+
"epoch": 0.06386753030974972,
|
| 5893 |
+
"grad_norm": 42.75,
|
| 5894 |
+
"learning_rate": 0.0009994746348724727,
|
| 5895 |
+
"loss": 33.951,
|
| 5896 |
+
"step": 20450
|
| 5897 |
+
},
|
| 5898 |
+
{
|
| 5899 |
+
"epoch": 0.06394560797516506,
|
| 5900 |
+
"grad_norm": 50.0,
|
| 5901 |
+
"learning_rate": 0.000999468701653216,
|
| 5902 |
+
"loss": 34.056,
|
| 5903 |
+
"step": 20475
|
| 5904 |
+
},
|
| 5905 |
+
{
|
| 5906 |
+
"epoch": 0.0640236856405804,
|
| 5907 |
+
"grad_norm": 50.5,
|
| 5908 |
+
"learning_rate": 0.0009994627351361772,
|
| 5909 |
+
"loss": 33.9114,
|
| 5910 |
+
"step": 20500
|
| 5911 |
+
},
|
| 5912 |
+
{
|
| 5913 |
+
"epoch": 0.06410176330599573,
|
| 5914 |
+
"grad_norm": 42.25,
|
| 5915 |
+
"learning_rate": 0.0009994567353217541,
|
| 5916 |
+
"loss": 34.2422,
|
| 5917 |
+
"step": 20525
|
| 5918 |
+
},
|
| 5919 |
+
{
|
| 5920 |
+
"epoch": 0.06417984097141108,
|
| 5921 |
+
"grad_norm": 44.25,
|
| 5922 |
+
"learning_rate": 0.0009994507022103465,
|
| 5923 |
+
"loss": 34.0631,
|
| 5924 |
+
"step": 20550
|
| 5925 |
+
},
|
| 5926 |
+
{
|
| 5927 |
+
"epoch": 0.06425791863682642,
|
| 5928 |
+
"grad_norm": 39.75,
|
| 5929 |
+
"learning_rate": 0.000999444635802357,
|
| 5930 |
+
"loss": 33.8447,
|
| 5931 |
+
"step": 20575
|
| 5932 |
+
},
|
| 5933 |
+
{
|
| 5934 |
+
"epoch": 0.06433599630224177,
|
| 5935 |
+
"grad_norm": 44.75,
|
| 5936 |
+
"learning_rate": 0.00099943853609819,
|
| 5937 |
+
"loss": 33.8587,
|
| 5938 |
+
"step": 20600
|
| 5939 |
+
},
|
| 5940 |
+
{
|
| 5941 |
+
"epoch": 0.06441407396765711,
|
| 5942 |
+
"grad_norm": 39.25,
|
| 5943 |
+
"learning_rate": 0.0009994324030982518,
|
| 5944 |
+
"loss": 33.943,
|
| 5945 |
+
"step": 20625
|
| 5946 |
+
},
|
| 5947 |
+
{
|
| 5948 |
+
"epoch": 0.06449215163307245,
|
| 5949 |
+
"grad_norm": 41.75,
|
| 5950 |
+
"learning_rate": 0.0009994262368029515,
|
| 5951 |
+
"loss": 33.9425,
|
| 5952 |
+
"step": 20650
|
| 5953 |
+
},
|
| 5954 |
+
{
|
| 5955 |
+
"epoch": 0.0645702292984878,
|
| 5956 |
+
"grad_norm": 44.5,
|
| 5957 |
+
"learning_rate": 0.0009994200372127,
|
| 5958 |
+
"loss": 34.0832,
|
| 5959 |
+
"step": 20675
|
| 5960 |
+
},
|
| 5961 |
+
{
|
| 5962 |
+
"epoch": 0.06464830696390314,
|
| 5963 |
+
"grad_norm": 39.25,
|
| 5964 |
+
"learning_rate": 0.000999413804327911,
|
| 5965 |
+
"loss": 33.9888,
|
| 5966 |
+
"step": 20700
|
| 5967 |
+
},
|
| 5968 |
+
{
|
| 5969 |
+
"epoch": 0.06472638462931847,
|
| 5970 |
+
"grad_norm": 43.75,
|
| 5971 |
+
"learning_rate": 0.0009994075381489994,
|
| 5972 |
+
"loss": 34.1022,
|
| 5973 |
+
"step": 20725
|
| 5974 |
+
},
|
| 5975 |
+
{
|
| 5976 |
+
"epoch": 0.06480446229473381,
|
| 5977 |
+
"grad_norm": 44.25,
|
| 5978 |
+
"learning_rate": 0.0009994012386763836,
|
| 5979 |
+
"loss": 33.9719,
|
| 5980 |
+
"step": 20750
|
| 5981 |
+
},
|
| 5982 |
+
{
|
| 5983 |
+
"epoch": 0.06488253996014916,
|
| 5984 |
+
"grad_norm": 42.0,
|
| 5985 |
+
"learning_rate": 0.000999394905910483,
|
| 5986 |
+
"loss": 33.7568,
|
| 5987 |
+
"step": 20775
|
| 5988 |
+
},
|
| 5989 |
+
{
|
| 5990 |
+
"epoch": 0.0649606176255645,
|
| 5991 |
+
"grad_norm": 43.75,
|
| 5992 |
+
"learning_rate": 0.0009993885398517201,
|
| 5993 |
+
"loss": 33.7079,
|
| 5994 |
+
"step": 20800
|
| 5995 |
+
},
|
| 5996 |
+
{
|
| 5997 |
+
"epoch": 0.06503869529097984,
|
| 5998 |
+
"grad_norm": 40.0,
|
| 5999 |
+
"learning_rate": 0.0009993821405005195,
|
| 6000 |
+
"loss": 33.8396,
|
| 6001 |
+
"step": 20825
|
| 6002 |
+
},
|
| 6003 |
+
{
|
| 6004 |
+
"epoch": 0.06511677295639519,
|
| 6005 |
+
"grad_norm": 42.5,
|
| 6006 |
+
"learning_rate": 0.0009993757078573073,
|
| 6007 |
+
"loss": 33.6027,
|
| 6008 |
+
"step": 20850
|
| 6009 |
+
},
|
| 6010 |
+
{
|
| 6011 |
+
"epoch": 0.06519485062181053,
|
| 6012 |
+
"grad_norm": 42.5,
|
| 6013 |
+
"learning_rate": 0.0009993692419225126,
|
| 6014 |
+
"loss": 33.5388,
|
| 6015 |
+
"step": 20875
|
| 6016 |
+
},
|
| 6017 |
+
{
|
| 6018 |
+
"epoch": 0.06527292828722588,
|
| 6019 |
+
"grad_norm": 55.0,
|
| 6020 |
+
"learning_rate": 0.0009993627426965667,
|
| 6021 |
+
"loss": 33.775,
|
| 6022 |
+
"step": 20900
|
| 6023 |
+
},
|
| 6024 |
+
{
|
| 6025 |
+
"epoch": 0.0653510059526412,
|
| 6026 |
+
"grad_norm": 39.0,
|
| 6027 |
+
"learning_rate": 0.0009993562101799024,
|
| 6028 |
+
"loss": 33.8984,
|
| 6029 |
+
"step": 20925
|
| 6030 |
+
},
|
| 6031 |
+
{
|
| 6032 |
+
"epoch": 0.06542908361805655,
|
| 6033 |
+
"grad_norm": 41.5,
|
| 6034 |
+
"learning_rate": 0.0009993496443729557,
|
| 6035 |
+
"loss": 33.8582,
|
| 6036 |
+
"step": 20950
|
| 6037 |
+
},
|
| 6038 |
+
{
|
| 6039 |
+
"epoch": 0.06550716128347189,
|
| 6040 |
+
"grad_norm": 37.25,
|
| 6041 |
+
"learning_rate": 0.0009993430452761639,
|
| 6042 |
+
"loss": 33.8915,
|
| 6043 |
+
"step": 20975
|
| 6044 |
+
},
|
| 6045 |
+
{
|
| 6046 |
+
"epoch": 0.06558523894888724,
|
| 6047 |
+
"grad_norm": 35.0,
|
| 6048 |
+
"learning_rate": 0.0009993364128899672,
|
| 6049 |
+
"loss": 33.5705,
|
| 6050 |
+
"step": 21000
|
| 6051 |
+
},
|
| 6052 |
+
{
|
| 6053 |
+
"epoch": 0.06558523894888724,
|
| 6054 |
+
"eval_loss": 33.73247146606445,
|
| 6055 |
+
"eval_runtime": 102.3252,
|
| 6056 |
+
"eval_samples_per_second": 50.848,
|
| 6057 |
+
"eval_steps_per_second": 3.186,
|
| 6058 |
+
"step": 21000
|
| 6059 |
}
|
| 6060 |
],
|
| 6061 |
"logging_steps": 25,
|
|
|
|
| 6075 |
"attributes": {}
|
| 6076 |
}
|
| 6077 |
},
|
| 6078 |
+
"total_flos": 2.663111367480836e+18,
|
| 6079 |
"train_batch_size": 8,
|
| 6080 |
"trial_name": null,
|
| 6081 |
"trial_params": null
|