Training in progress, step 39000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 517931840
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:92d9de77e169ab6755eee5c1e2686926ed90a44667e2ca5eaaa214d0bfb470d0
|
| 3 |
size 517931840
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1035661434
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:32d052133c3436978400ad1196231762f190f602db77149c4cd7cd33bf55ce04
|
| 3 |
size 1035661434
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:25ff3bb999ac5f8f98fec3e5d0ee521c3ada6460eb2706bfa5386f4fa0d04e58
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0453f791eed05815dae518781dc172eec5529318c2577a889d73a15d6a871e53
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -6772,11 +6772,189 @@
|
|
| 6772 |
"eval_steps_per_second": 18.833,
|
| 6773 |
"num_input_tokens_seen": 39845884160,
|
| 6774 |
"step": 38000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6775 |
}
|
| 6776 |
],
|
| 6777 |
"logging_steps": 50,
|
| 6778 |
"max_steps": 200000,
|
| 6779 |
-
"num_input_tokens_seen":
|
| 6780 |
"num_train_epochs": 5,
|
| 6781 |
"save_steps": 1000,
|
| 6782 |
"stateful_callbacks": {
|
|
@@ -6791,7 +6969,7 @@
|
|
| 6791 |
"attributes": {}
|
| 6792 |
}
|
| 6793 |
},
|
| 6794 |
-
"total_flos": 2.
|
| 6795 |
"train_batch_size": 64,
|
| 6796 |
"trial_name": null,
|
| 6797 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.8566756864731733,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 39000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 6772 |
"eval_steps_per_second": 18.833,
|
| 6773 |
"num_input_tokens_seen": 39845884160,
|
| 6774 |
"step": 38000
|
| 6775 |
+
},
|
| 6776 |
+
{
|
| 6777 |
+
"epoch": 0.8358079453924165,
|
| 6778 |
+
"grad_norm": 0.15346269309520721,
|
| 6779 |
+
"learning_rate": 0.001,
|
| 6780 |
+
"loss": 2.645,
|
| 6781 |
+
"num_input_tokens_seen": 39898312960,
|
| 6782 |
+
"step": 38050
|
| 6783 |
+
},
|
| 6784 |
+
{
|
| 6785 |
+
"epoch": 0.8369062475545616,
|
| 6786 |
+
"grad_norm": 0.1504630148410797,
|
| 6787 |
+
"learning_rate": 0.001,
|
| 6788 |
+
"loss": 2.666,
|
| 6789 |
+
"num_input_tokens_seen": 39950741760,
|
| 6790 |
+
"step": 38100
|
| 6791 |
+
},
|
| 6792 |
+
{
|
| 6793 |
+
"epoch": 0.8380045497167067,
|
| 6794 |
+
"grad_norm": 0.19098903238773346,
|
| 6795 |
+
"learning_rate": 0.001,
|
| 6796 |
+
"loss": 2.6649,
|
| 6797 |
+
"num_input_tokens_seen": 40003170560,
|
| 6798 |
+
"step": 38150
|
| 6799 |
+
},
|
| 6800 |
+
{
|
| 6801 |
+
"epoch": 0.8391028518788518,
|
| 6802 |
+
"grad_norm": 0.15553973615169525,
|
| 6803 |
+
"learning_rate": 0.001,
|
| 6804 |
+
"loss": 2.6565,
|
| 6805 |
+
"num_input_tokens_seen": 40055599360,
|
| 6806 |
+
"step": 38200
|
| 6807 |
+
},
|
| 6808 |
+
{
|
| 6809 |
+
"epoch": 0.8402011540409968,
|
| 6810 |
+
"grad_norm": 0.15650159120559692,
|
| 6811 |
+
"learning_rate": 0.001,
|
| 6812 |
+
"loss": 2.6568,
|
| 6813 |
+
"num_input_tokens_seen": 40108028160,
|
| 6814 |
+
"step": 38250
|
| 6815 |
+
},
|
| 6816 |
+
{
|
| 6817 |
+
"epoch": 0.841299456203142,
|
| 6818 |
+
"grad_norm": 0.17787836492061615,
|
| 6819 |
+
"learning_rate": 0.001,
|
| 6820 |
+
"loss": 2.6497,
|
| 6821 |
+
"num_input_tokens_seen": 40160456960,
|
| 6822 |
+
"step": 38300
|
| 6823 |
+
},
|
| 6824 |
+
{
|
| 6825 |
+
"epoch": 0.8423977583652871,
|
| 6826 |
+
"grad_norm": 0.1535162478685379,
|
| 6827 |
+
"learning_rate": 0.001,
|
| 6828 |
+
"loss": 2.6492,
|
| 6829 |
+
"num_input_tokens_seen": 40212885760,
|
| 6830 |
+
"step": 38350
|
| 6831 |
+
},
|
| 6832 |
+
{
|
| 6833 |
+
"epoch": 0.8434960605274322,
|
| 6834 |
+
"grad_norm": 0.16713359951972961,
|
| 6835 |
+
"learning_rate": 0.001,
|
| 6836 |
+
"loss": 2.6534,
|
| 6837 |
+
"num_input_tokens_seen": 40265314560,
|
| 6838 |
+
"step": 38400
|
| 6839 |
+
},
|
| 6840 |
+
{
|
| 6841 |
+
"epoch": 0.8445943626895772,
|
| 6842 |
+
"grad_norm": 0.17087998986244202,
|
| 6843 |
+
"learning_rate": 0.001,
|
| 6844 |
+
"loss": 2.6602,
|
| 6845 |
+
"num_input_tokens_seen": 40317743360,
|
| 6846 |
+
"step": 38450
|
| 6847 |
+
},
|
| 6848 |
+
{
|
| 6849 |
+
"epoch": 0.8456926648517223,
|
| 6850 |
+
"grad_norm": 0.15651412308216095,
|
| 6851 |
+
"learning_rate": 0.001,
|
| 6852 |
+
"loss": 2.6547,
|
| 6853 |
+
"num_input_tokens_seen": 40370172160,
|
| 6854 |
+
"step": 38500
|
| 6855 |
+
},
|
| 6856 |
+
{
|
| 6857 |
+
"epoch": 0.8456926648517223,
|
| 6858 |
+
"eval_loss": 2.5524706840515137,
|
| 6859 |
+
"eval_runtime": 66.5023,
|
| 6860 |
+
"eval_samples_per_second": 75.185,
|
| 6861 |
+
"eval_steps_per_second": 18.796,
|
| 6862 |
+
"num_input_tokens_seen": 40370172160,
|
| 6863 |
+
"step": 38500
|
| 6864 |
+
},
|
| 6865 |
+
{
|
| 6866 |
+
"epoch": 0.8467909670138675,
|
| 6867 |
+
"grad_norm": 0.15205898880958557,
|
| 6868 |
+
"learning_rate": 0.001,
|
| 6869 |
+
"loss": 2.6541,
|
| 6870 |
+
"num_input_tokens_seen": 40422600960,
|
| 6871 |
+
"step": 38550
|
| 6872 |
+
},
|
| 6873 |
+
{
|
| 6874 |
+
"epoch": 0.8478892691760125,
|
| 6875 |
+
"grad_norm": 0.15865832567214966,
|
| 6876 |
+
"learning_rate": 0.001,
|
| 6877 |
+
"loss": 2.6536,
|
| 6878 |
+
"num_input_tokens_seen": 40475029760,
|
| 6879 |
+
"step": 38600
|
| 6880 |
+
},
|
| 6881 |
+
{
|
| 6882 |
+
"epoch": 0.8489875713381576,
|
| 6883 |
+
"grad_norm": 0.133284330368042,
|
| 6884 |
+
"learning_rate": 0.001,
|
| 6885 |
+
"loss": 2.6531,
|
| 6886 |
+
"num_input_tokens_seen": 40527458560,
|
| 6887 |
+
"step": 38650
|
| 6888 |
+
},
|
| 6889 |
+
{
|
| 6890 |
+
"epoch": 0.8500858735003027,
|
| 6891 |
+
"grad_norm": 0.1421806663274765,
|
| 6892 |
+
"learning_rate": 0.001,
|
| 6893 |
+
"loss": 2.6558,
|
| 6894 |
+
"num_input_tokens_seen": 40579887360,
|
| 6895 |
+
"step": 38700
|
| 6896 |
+
},
|
| 6897 |
+
{
|
| 6898 |
+
"epoch": 0.8511841756624479,
|
| 6899 |
+
"grad_norm": 0.19429996609687805,
|
| 6900 |
+
"learning_rate": 0.001,
|
| 6901 |
+
"loss": 2.6628,
|
| 6902 |
+
"num_input_tokens_seen": 40632316160,
|
| 6903 |
+
"step": 38750
|
| 6904 |
+
},
|
| 6905 |
+
{
|
| 6906 |
+
"epoch": 0.8522824778245929,
|
| 6907 |
+
"grad_norm": 0.14661937952041626,
|
| 6908 |
+
"learning_rate": 0.001,
|
| 6909 |
+
"loss": 2.6594,
|
| 6910 |
+
"num_input_tokens_seen": 40684744960,
|
| 6911 |
+
"step": 38800
|
| 6912 |
+
},
|
| 6913 |
+
{
|
| 6914 |
+
"epoch": 0.853380779986738,
|
| 6915 |
+
"grad_norm": 0.1694687008857727,
|
| 6916 |
+
"learning_rate": 0.001,
|
| 6917 |
+
"loss": 2.6571,
|
| 6918 |
+
"num_input_tokens_seen": 40737173760,
|
| 6919 |
+
"step": 38850
|
| 6920 |
+
},
|
| 6921 |
+
{
|
| 6922 |
+
"epoch": 0.8544790821488831,
|
| 6923 |
+
"grad_norm": 0.152188241481781,
|
| 6924 |
+
"learning_rate": 0.001,
|
| 6925 |
+
"loss": 2.6534,
|
| 6926 |
+
"num_input_tokens_seen": 40789602560,
|
| 6927 |
+
"step": 38900
|
| 6928 |
+
},
|
| 6929 |
+
{
|
| 6930 |
+
"epoch": 0.8555773843110281,
|
| 6931 |
+
"grad_norm": 0.1554640680551529,
|
| 6932 |
+
"learning_rate": 0.001,
|
| 6933 |
+
"loss": 2.649,
|
| 6934 |
+
"num_input_tokens_seen": 40842031360,
|
| 6935 |
+
"step": 38950
|
| 6936 |
+
},
|
| 6937 |
+
{
|
| 6938 |
+
"epoch": 0.8566756864731733,
|
| 6939 |
+
"grad_norm": 0.1481955647468567,
|
| 6940 |
+
"learning_rate": 0.001,
|
| 6941 |
+
"loss": 2.6527,
|
| 6942 |
+
"num_input_tokens_seen": 40894460160,
|
| 6943 |
+
"step": 39000
|
| 6944 |
+
},
|
| 6945 |
+
{
|
| 6946 |
+
"epoch": 0.8566756864731733,
|
| 6947 |
+
"eval_loss": 2.547664165496826,
|
| 6948 |
+
"eval_runtime": 66.2874,
|
| 6949 |
+
"eval_samples_per_second": 75.429,
|
| 6950 |
+
"eval_steps_per_second": 18.857,
|
| 6951 |
+
"num_input_tokens_seen": 40894460160,
|
| 6952 |
+
"step": 39000
|
| 6953 |
}
|
| 6954 |
],
|
| 6955 |
"logging_steps": 50,
|
| 6956 |
"max_steps": 200000,
|
| 6957 |
+
"num_input_tokens_seen": 40894460160,
|
| 6958 |
"num_train_epochs": 5,
|
| 6959 |
"save_steps": 1000,
|
| 6960 |
"stateful_callbacks": {
|
|
|
|
| 6969 |
"attributes": {}
|
| 6970 |
}
|
| 6971 |
},
|
| 6972 |
+
"total_flos": 2.3289694735724052e+19,
|
| 6973 |
"train_batch_size": 64,
|
| 6974 |
"trial_name": null,
|
| 6975 |
"trial_params": null
|