Training in progress, step 62000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cbbf7b607c85a5d696bff54af0adb9f239d76d76446306b0d75e85fb86338432
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3561f0a9213e3ac9e43eff9c9d946a42b171ff83db0a3806965305d6e1bbe28a
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b604bf86b8b70beb6e4043604c61f8577f1fbe75a9d1e20249b5622ec5aa2654
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:68dfbb60d9dcf18c45914087cca91dc6c214da7f11269c4a414921902f313d06
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -10866,11 +10866,189 @@
|
|
| 10866 |
"eval_steps_per_second": 23.996,
|
| 10867 |
"num_input_tokens_seen": 15990784000,
|
| 10868 |
"step": 61000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10869 |
}
|
| 10870 |
],
|
| 10871 |
"logging_steps": 50,
|
| 10872 |
"max_steps": 70000,
|
| 10873 |
-
"num_input_tokens_seen":
|
| 10874 |
"num_train_epochs": 1,
|
| 10875 |
"save_steps": 1000,
|
| 10876 |
"stateful_callbacks": {
|
|
@@ -10885,7 +11063,7 @@
|
|
| 10885 |
"attributes": {}
|
| 10886 |
}
|
| 10887 |
},
|
| 10888 |
-
"total_flos": 4.
|
| 10889 |
"train_batch_size": 64,
|
| 10890 |
"trial_name": null,
|
| 10891 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.41704293019743954,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 62000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 10866 |
"eval_steps_per_second": 23.996,
|
| 10867 |
"num_input_tokens_seen": 15990784000,
|
| 10868 |
"step": 61000
|
| 10869 |
+
},
|
| 10870 |
+
{
|
| 10871 |
+
"epoch": 0.41065275626699493,
|
| 10872 |
+
"grad_norm": 0.3535885810852051,
|
| 10873 |
+
"learning_rate": 0.0005927261190557954,
|
| 10874 |
+
"loss": 3.0102,
|
| 10875 |
+
"num_input_tokens_seen": 16003891200,
|
| 10876 |
+
"step": 61050
|
| 10877 |
+
},
|
| 10878 |
+
{
|
| 10879 |
+
"epoch": 0.41098908121070254,
|
| 10880 |
+
"grad_norm": 0.2633107304573059,
|
| 10881 |
+
"learning_rate": 0.0005878981399671774,
|
| 10882 |
+
"loss": 3.0424,
|
| 10883 |
+
"num_input_tokens_seen": 16016998400,
|
| 10884 |
+
"step": 61100
|
| 10885 |
+
},
|
| 10886 |
+
{
|
| 10887 |
+
"epoch": 0.41132540615441016,
|
| 10888 |
+
"grad_norm": 0.3054018020629883,
|
| 10889 |
+
"learning_rate": 0.0005830616890728827,
|
| 10890 |
+
"loss": 3.0233,
|
| 10891 |
+
"num_input_tokens_seen": 16030105600,
|
| 10892 |
+
"step": 61150
|
| 10893 |
+
},
|
| 10894 |
+
{
|
| 10895 |
+
"epoch": 0.41166173109811777,
|
| 10896 |
+
"grad_norm": 0.21453993022441864,
|
| 10897 |
+
"learning_rate": 0.0005782172325201155,
|
| 10898 |
+
"loss": 3.018,
|
| 10899 |
+
"num_input_tokens_seen": 16043212800,
|
| 10900 |
+
"step": 61200
|
| 10901 |
+
},
|
| 10902 |
+
{
|
| 10903 |
+
"epoch": 0.4119980560418254,
|
| 10904 |
+
"grad_norm": 0.27815598249435425,
|
| 10905 |
+
"learning_rate": 0.0005733652372276809,
|
| 10906 |
+
"loss": 3.0254,
|
| 10907 |
+
"num_input_tokens_seen": 16056320000,
|
| 10908 |
+
"step": 61250
|
| 10909 |
+
},
|
| 10910 |
+
{
|
| 10911 |
+
"epoch": 0.412334380985533,
|
| 10912 |
+
"grad_norm": 0.20687313377857208,
|
| 10913 |
+
"learning_rate": 0.0005685061708409841,
|
| 10914 |
+
"loss": 3.0165,
|
| 10915 |
+
"num_input_tokens_seen": 16069427200,
|
| 10916 |
+
"step": 61300
|
| 10917 |
+
},
|
| 10918 |
+
{
|
| 10919 |
+
"epoch": 0.4126707059292406,
|
| 10920 |
+
"grad_norm": 0.1985252946615219,
|
| 10921 |
+
"learning_rate": 0.0005636405016869566,
|
| 10922 |
+
"loss": 3.0164,
|
| 10923 |
+
"num_input_tokens_seen": 16082534400,
|
| 10924 |
+
"step": 61350
|
| 10925 |
+
},
|
| 10926 |
+
{
|
| 10927 |
+
"epoch": 0.4130070308729482,
|
| 10928 |
+
"grad_norm": 0.26703181862831116,
|
| 10929 |
+
"learning_rate": 0.0005587686987289189,
|
| 10930 |
+
"loss": 3.0001,
|
| 10931 |
+
"num_input_tokens_seen": 16095641600,
|
| 10932 |
+
"step": 61400
|
| 10933 |
+
},
|
| 10934 |
+
{
|
| 10935 |
+
"epoch": 0.4133433558166558,
|
| 10936 |
+
"grad_norm": 0.1948036104440689,
|
| 10937 |
+
"learning_rate": 0.0005538912315213797,
|
| 10938 |
+
"loss": 3.0058,
|
| 10939 |
+
"num_input_tokens_seen": 16108748800,
|
| 10940 |
+
"step": 61450
|
| 10941 |
+
},
|
| 10942 |
+
{
|
| 10943 |
+
"epoch": 0.41367968076036343,
|
| 10944 |
+
"grad_norm": 0.20653308928012848,
|
| 10945 |
+
"learning_rate": 0.0005490085701647804,
|
| 10946 |
+
"loss": 3.0115,
|
| 10947 |
+
"num_input_tokens_seen": 16121856000,
|
| 10948 |
+
"step": 61500
|
| 10949 |
+
},
|
| 10950 |
+
{
|
| 10951 |
+
"epoch": 0.41367968076036343,
|
| 10952 |
+
"eval_loss": 2.9048781394958496,
|
| 10953 |
+
"eval_runtime": 53.8207,
|
| 10954 |
+
"eval_samples_per_second": 92.901,
|
| 10955 |
+
"eval_steps_per_second": 23.225,
|
| 10956 |
+
"num_input_tokens_seen": 16121856000,
|
| 10957 |
+
"step": 61500
|
| 10958 |
+
},
|
| 10959 |
+
{
|
| 10960 |
+
"epoch": 0.41401600570407104,
|
| 10961 |
+
"grad_norm": 0.19605295360088348,
|
| 10962 |
+
"learning_rate": 0.0005441211852601849,
|
| 10963 |
+
"loss": 3.0225,
|
| 10964 |
+
"num_input_tokens_seen": 16134963200,
|
| 10965 |
+
"step": 61550
|
| 10966 |
+
},
|
| 10967 |
+
{
|
| 10968 |
+
"epoch": 0.41435233064777865,
|
| 10969 |
+
"grad_norm": 0.17526155710220337,
|
| 10970 |
+
"learning_rate": 0.0005392295478639225,
|
| 10971 |
+
"loss": 3.0117,
|
| 10972 |
+
"num_input_tokens_seen": 16148070400,
|
| 10973 |
+
"step": 61600
|
| 10974 |
+
},
|
| 10975 |
+
{
|
| 10976 |
+
"epoch": 0.41468865559148627,
|
| 10977 |
+
"grad_norm": 0.17657403647899628,
|
| 10978 |
+
"learning_rate": 0.0005343341294421868,
|
| 10979 |
+
"loss": 3.0107,
|
| 10980 |
+
"num_input_tokens_seen": 16161177600,
|
| 10981 |
+
"step": 61650
|
| 10982 |
+
},
|
| 10983 |
+
{
|
| 10984 |
+
"epoch": 0.4150249805351939,
|
| 10985 |
+
"grad_norm": 0.18658681213855743,
|
| 10986 |
+
"learning_rate": 0.0005294354018255945,
|
| 10987 |
+
"loss": 3.0085,
|
| 10988 |
+
"num_input_tokens_seen": 16174284800,
|
| 10989 |
+
"step": 61700
|
| 10990 |
+
},
|
| 10991 |
+
{
|
| 10992 |
+
"epoch": 0.4153613054789015,
|
| 10993 |
+
"grad_norm": 0.24781519174575806,
|
| 10994 |
+
"learning_rate": 0.0005245338371637091,
|
| 10995 |
+
"loss": 2.9939,
|
| 10996 |
+
"num_input_tokens_seen": 16187392000,
|
| 10997 |
+
"step": 61750
|
| 10998 |
+
},
|
| 10999 |
+
{
|
| 11000 |
+
"epoch": 0.4156976304226091,
|
| 11001 |
+
"grad_norm": 0.20824941992759705,
|
| 11002 |
+
"learning_rate": 0.0005196299078795343,
|
| 11003 |
+
"loss": 3.0038,
|
| 11004 |
+
"num_input_tokens_seen": 16200499200,
|
| 11005 |
+
"step": 61800
|
| 11006 |
+
},
|
| 11007 |
+
{
|
| 11008 |
+
"epoch": 0.4160339553663167,
|
| 11009 |
+
"grad_norm": 0.38262441754341125,
|
| 11010 |
+
"learning_rate": 0.0005147240866239817,
|
| 11011 |
+
"loss": 3.0141,
|
| 11012 |
+
"num_input_tokens_seen": 16213606400,
|
| 11013 |
+
"step": 61850
|
| 11014 |
+
},
|
| 11015 |
+
{
|
| 11016 |
+
"epoch": 0.4163702803100243,
|
| 11017 |
+
"grad_norm": 0.200628861784935,
|
| 11018 |
+
"learning_rate": 0.0005098168462303141,
|
| 11019 |
+
"loss": 3.0187,
|
| 11020 |
+
"num_input_tokens_seen": 16226713600,
|
| 11021 |
+
"step": 61900
|
| 11022 |
+
},
|
| 11023 |
+
{
|
| 11024 |
+
"epoch": 0.41670660525373193,
|
| 11025 |
+
"grad_norm": 0.18858259916305542,
|
| 11026 |
+
"learning_rate": 0.000504908659668575,
|
| 11027 |
+
"loss": 3.0049,
|
| 11028 |
+
"num_input_tokens_seen": 16239820800,
|
| 11029 |
+
"step": 61950
|
| 11030 |
+
},
|
| 11031 |
+
{
|
| 11032 |
+
"epoch": 0.41704293019743954,
|
| 11033 |
+
"grad_norm": 0.19025108218193054,
|
| 11034 |
+
"learning_rate": 0.0005,
|
| 11035 |
+
"loss": 3.0079,
|
| 11036 |
+
"num_input_tokens_seen": 16252928000,
|
| 11037 |
+
"step": 62000
|
| 11038 |
+
},
|
| 11039 |
+
{
|
| 11040 |
+
"epoch": 0.41704293019743954,
|
| 11041 |
+
"eval_loss": 2.9012608528137207,
|
| 11042 |
+
"eval_runtime": 52.7052,
|
| 11043 |
+
"eval_samples_per_second": 94.867,
|
| 11044 |
+
"eval_steps_per_second": 23.717,
|
| 11045 |
+
"num_input_tokens_seen": 16252928000,
|
| 11046 |
+
"step": 62000
|
| 11047 |
}
|
| 11048 |
],
|
| 11049 |
"logging_steps": 50,
|
| 11050 |
"max_steps": 70000,
|
| 11051 |
+
"num_input_tokens_seen": 16252928000,
|
| 11052 |
"num_train_epochs": 1,
|
| 11053 |
"save_steps": 1000,
|
| 11054 |
"stateful_callbacks": {
|
|
|
|
| 11063 |
"attributes": {}
|
| 11064 |
}
|
| 11065 |
},
|
| 11066 |
+
"total_flos": 4.34781686857728e+18,
|
| 11067 |
"train_batch_size": 64,
|
| 11068 |
"trial_name": null,
|
| 11069 |
"trial_params": null
|