Training in progress, step 50000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c9956ceaa01a8262c17e82fea9ac349503f1643baa686fe83baf73d6c182cfd
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f9a8f8b0ff9c7ab62e432b714de9517f6859e2ebcb731ff15954b08eab3fa5fd
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:08d6a67f7616cccd33f77a5e076df0611e7b35eb8ba28bbeb4122e81eca5afa0
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:66168c288d1955c1c664cfa64be79d9023fb79ca5529a1e6b201d572885b2dfe
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -8730,11 +8730,189 @@
|
|
| 8730 |
"eval_steps_per_second": 23.197,
|
| 8731 |
"num_input_tokens_seen": 12845051456,
|
| 8732 |
"step": 49000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8733 |
}
|
| 8734 |
],
|
| 8735 |
"logging_steps": 50,
|
| 8736 |
"max_steps": 70000,
|
| 8737 |
-
"num_input_tokens_seen":
|
| 8738 |
"num_train_epochs": 1,
|
| 8739 |
"save_steps": 1000,
|
| 8740 |
"stateful_callbacks": {
|
|
@@ -8749,7 +8927,7 @@
|
|
| 8749 |
"attributes": {}
|
| 8750 |
}
|
| 8751 |
},
|
| 8752 |
-
"total_flos": 3.
|
| 8753 |
"train_batch_size": 64,
|
| 8754 |
"trial_name": null,
|
| 8755 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.23850125809413644,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 50000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 8730 |
"eval_steps_per_second": 23.197,
|
| 8731 |
"num_input_tokens_seen": 12845051456,
|
| 8732 |
"step": 49000
|
| 8733 |
+
},
|
| 8734 |
+
{
|
| 8735 |
+
"epoch": 0.23396973419034786,
|
| 8736 |
+
"grad_norm": 0.209337517619133,
|
| 8737 |
+
"learning_rate": 0.001,
|
| 8738 |
+
"loss": 2.6348,
|
| 8739 |
+
"num_input_tokens_seen": 12858158656,
|
| 8740 |
+
"step": 49050
|
| 8741 |
+
},
|
| 8742 |
+
{
|
| 8743 |
+
"epoch": 0.234208235448442,
|
| 8744 |
+
"grad_norm": 0.1974038928747177,
|
| 8745 |
+
"learning_rate": 0.001,
|
| 8746 |
+
"loss": 2.6158,
|
| 8747 |
+
"num_input_tokens_seen": 12871265856,
|
| 8748 |
+
"step": 49100
|
| 8749 |
+
},
|
| 8750 |
+
{
|
| 8751 |
+
"epoch": 0.23444673670653612,
|
| 8752 |
+
"grad_norm": 0.28099164366722107,
|
| 8753 |
+
"learning_rate": 0.001,
|
| 8754 |
+
"loss": 2.6101,
|
| 8755 |
+
"num_input_tokens_seen": 12884373056,
|
| 8756 |
+
"step": 49150
|
| 8757 |
+
},
|
| 8758 |
+
{
|
| 8759 |
+
"epoch": 0.23468523796463026,
|
| 8760 |
+
"grad_norm": 0.2172873318195343,
|
| 8761 |
+
"learning_rate": 0.001,
|
| 8762 |
+
"loss": 2.596,
|
| 8763 |
+
"num_input_tokens_seen": 12897480256,
|
| 8764 |
+
"step": 49200
|
| 8765 |
+
},
|
| 8766 |
+
{
|
| 8767 |
+
"epoch": 0.2349237392227244,
|
| 8768 |
+
"grad_norm": 0.2120896875858307,
|
| 8769 |
+
"learning_rate": 0.001,
|
| 8770 |
+
"loss": 2.5994,
|
| 8771 |
+
"num_input_tokens_seen": 12910587456,
|
| 8772 |
+
"step": 49250
|
| 8773 |
+
},
|
| 8774 |
+
{
|
| 8775 |
+
"epoch": 0.23516224048081855,
|
| 8776 |
+
"grad_norm": 0.20109935104846954,
|
| 8777 |
+
"learning_rate": 0.001,
|
| 8778 |
+
"loss": 2.6101,
|
| 8779 |
+
"num_input_tokens_seen": 12923694656,
|
| 8780 |
+
"step": 49300
|
| 8781 |
+
},
|
| 8782 |
+
{
|
| 8783 |
+
"epoch": 0.23540074173891268,
|
| 8784 |
+
"grad_norm": 0.20735585689544678,
|
| 8785 |
+
"learning_rate": 0.001,
|
| 8786 |
+
"loss": 2.6142,
|
| 8787 |
+
"num_input_tokens_seen": 12936801856,
|
| 8788 |
+
"step": 49350
|
| 8789 |
+
},
|
| 8790 |
+
{
|
| 8791 |
+
"epoch": 0.2356392429970068,
|
| 8792 |
+
"grad_norm": 0.21295137703418732,
|
| 8793 |
+
"learning_rate": 0.001,
|
| 8794 |
+
"loss": 2.6226,
|
| 8795 |
+
"num_input_tokens_seen": 12949909056,
|
| 8796 |
+
"step": 49400
|
| 8797 |
+
},
|
| 8798 |
+
{
|
| 8799 |
+
"epoch": 0.23587774425510094,
|
| 8800 |
+
"grad_norm": 0.20560845732688904,
|
| 8801 |
+
"learning_rate": 0.001,
|
| 8802 |
+
"loss": 2.6027,
|
| 8803 |
+
"num_input_tokens_seen": 12963016256,
|
| 8804 |
+
"step": 49450
|
| 8805 |
+
},
|
| 8806 |
+
{
|
| 8807 |
+
"epoch": 0.23611624551319507,
|
| 8808 |
+
"grad_norm": 0.33747321367263794,
|
| 8809 |
+
"learning_rate": 0.001,
|
| 8810 |
+
"loss": 2.6231,
|
| 8811 |
+
"num_input_tokens_seen": 12976123456,
|
| 8812 |
+
"step": 49500
|
| 8813 |
+
},
|
| 8814 |
+
{
|
| 8815 |
+
"epoch": 0.23611624551319507,
|
| 8816 |
+
"eval_loss": 2.5008058547973633,
|
| 8817 |
+
"eval_runtime": 54.2104,
|
| 8818 |
+
"eval_samples_per_second": 92.233,
|
| 8819 |
+
"eval_steps_per_second": 23.058,
|
| 8820 |
+
"num_input_tokens_seen": 12976123456,
|
| 8821 |
+
"step": 49500
|
| 8822 |
+
},
|
| 8823 |
+
{
|
| 8824 |
+
"epoch": 0.23635474677128923,
|
| 8825 |
+
"grad_norm": 0.24593485891819,
|
| 8826 |
+
"learning_rate": 0.001,
|
| 8827 |
+
"loss": 2.6336,
|
| 8828 |
+
"num_input_tokens_seen": 12989230656,
|
| 8829 |
+
"step": 49550
|
| 8830 |
+
},
|
| 8831 |
+
{
|
| 8832 |
+
"epoch": 0.23659324802938336,
|
| 8833 |
+
"grad_norm": 0.25253933668136597,
|
| 8834 |
+
"learning_rate": 0.001,
|
| 8835 |
+
"loss": 2.643,
|
| 8836 |
+
"num_input_tokens_seen": 13002337856,
|
| 8837 |
+
"step": 49600
|
| 8838 |
+
},
|
| 8839 |
+
{
|
| 8840 |
+
"epoch": 0.2368317492874775,
|
| 8841 |
+
"grad_norm": 0.24231670796871185,
|
| 8842 |
+
"learning_rate": 0.001,
|
| 8843 |
+
"loss": 2.6074,
|
| 8844 |
+
"num_input_tokens_seen": 13015445056,
|
| 8845 |
+
"step": 49650
|
| 8846 |
+
},
|
| 8847 |
+
{
|
| 8848 |
+
"epoch": 0.23707025054557163,
|
| 8849 |
+
"grad_norm": 0.2178962677717209,
|
| 8850 |
+
"learning_rate": 0.001,
|
| 8851 |
+
"loss": 2.6184,
|
| 8852 |
+
"num_input_tokens_seen": 13028552256,
|
| 8853 |
+
"step": 49700
|
| 8854 |
+
},
|
| 8855 |
+
{
|
| 8856 |
+
"epoch": 0.23730875180366576,
|
| 8857 |
+
"grad_norm": 0.2651260793209076,
|
| 8858 |
+
"learning_rate": 0.001,
|
| 8859 |
+
"loss": 2.6335,
|
| 8860 |
+
"num_input_tokens_seen": 13041659456,
|
| 8861 |
+
"step": 49750
|
| 8862 |
+
},
|
| 8863 |
+
{
|
| 8864 |
+
"epoch": 0.2375472530617599,
|
| 8865 |
+
"grad_norm": 0.1909639537334442,
|
| 8866 |
+
"learning_rate": 0.001,
|
| 8867 |
+
"loss": 2.61,
|
| 8868 |
+
"num_input_tokens_seen": 13054766656,
|
| 8869 |
+
"step": 49800
|
| 8870 |
+
},
|
| 8871 |
+
{
|
| 8872 |
+
"epoch": 0.23778575431985405,
|
| 8873 |
+
"grad_norm": 0.21107855439186096,
|
| 8874 |
+
"learning_rate": 0.001,
|
| 8875 |
+
"loss": 2.6333,
|
| 8876 |
+
"num_input_tokens_seen": 13067873856,
|
| 8877 |
+
"step": 49850
|
| 8878 |
+
},
|
| 8879 |
+
{
|
| 8880 |
+
"epoch": 0.23802425557794818,
|
| 8881 |
+
"grad_norm": 0.19366736710071564,
|
| 8882 |
+
"learning_rate": 0.001,
|
| 8883 |
+
"loss": 2.6068,
|
| 8884 |
+
"num_input_tokens_seen": 13080981056,
|
| 8885 |
+
"step": 49900
|
| 8886 |
+
},
|
| 8887 |
+
{
|
| 8888 |
+
"epoch": 0.2382627568360423,
|
| 8889 |
+
"grad_norm": 0.2851523458957672,
|
| 8890 |
+
"learning_rate": 0.001,
|
| 8891 |
+
"loss": 2.6183,
|
| 8892 |
+
"num_input_tokens_seen": 13094088256,
|
| 8893 |
+
"step": 49950
|
| 8894 |
+
},
|
| 8895 |
+
{
|
| 8896 |
+
"epoch": 0.23850125809413644,
|
| 8897 |
+
"grad_norm": 0.23617912828922272,
|
| 8898 |
+
"learning_rate": 0.001,
|
| 8899 |
+
"loss": 2.617,
|
| 8900 |
+
"num_input_tokens_seen": 13107195456,
|
| 8901 |
+
"step": 50000
|
| 8902 |
+
},
|
| 8903 |
+
{
|
| 8904 |
+
"epoch": 0.23850125809413644,
|
| 8905 |
+
"eval_loss": 2.497406005859375,
|
| 8906 |
+
"eval_runtime": 53.6538,
|
| 8907 |
+
"eval_samples_per_second": 93.19,
|
| 8908 |
+
"eval_steps_per_second": 23.298,
|
| 8909 |
+
"num_input_tokens_seen": 13107195456,
|
| 8910 |
+
"step": 50000
|
| 8911 |
}
|
| 8912 |
],
|
| 8913 |
"logging_steps": 50,
|
| 8914 |
"max_steps": 70000,
|
| 8915 |
+
"num_input_tokens_seen": 13107195456,
|
| 8916 |
"num_train_epochs": 1,
|
| 8917 |
"save_steps": 1000,
|
| 8918 |
"stateful_callbacks": {
|
|
|
|
| 8927 |
"attributes": {}
|
| 8928 |
}
|
| 8929 |
},
|
| 8930 |
+
"total_flos": 3.5063027107076506e+18,
|
| 8931 |
"train_batch_size": 64,
|
| 8932 |
"trial_name": null,
|
| 8933 |
"trial_params": null
|