Training in progress, step 124000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6da59dcdd189ba50995bfaca6dfb3c1f07cec1d39f2b04e6b589b61aa33bf008
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c0a4aebfce2ae0e56c21f66beb3519294df5637c5928eb84133802b9a02f01ec
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:98f0ee0fd151b13dc8525e6639746bb04660a2a355f86970459a4f08c593ef0a
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f4c196ff888110afc03a5fac8e049987b043db46c6b51b50b9a63aa8569f2b7f
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -21902,11 +21902,189 @@
|
|
| 21902 |
"eval_steps_per_second": 15.061,
|
| 21903 |
"num_input_tokens_seen": 64477051392,
|
| 21904 |
"step": 123000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21905 |
}
|
| 21906 |
],
|
| 21907 |
"logging_steps": 50,
|
| 21908 |
"max_steps": 140000,
|
| 21909 |
-
"num_input_tokens_seen":
|
| 21910 |
"num_train_epochs": 2,
|
| 21911 |
"save_steps": 1000,
|
| 21912 |
"stateful_callbacks": {
|
|
@@ -21921,7 +22099,7 @@
|
|
| 21921 |
"attributes": {}
|
| 21922 |
}
|
| 21923 |
},
|
| 21924 |
-
"total_flos": 1.
|
| 21925 |
"train_batch_size": 32,
|
| 21926 |
"trial_name": null,
|
| 21927 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.1829686251594977,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 124000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 21902 |
"eval_steps_per_second": 15.061,
|
| 21903 |
"num_input_tokens_seen": 64477051392,
|
| 21904 |
"step": 123000
|
| 21905 |
+
},
|
| 21906 |
+
{
|
| 21907 |
+
"epoch": 1.1739055773519205,
|
| 21908 |
+
"grad_norm": 0.14069771766662598,
|
| 21909 |
+
"learning_rate": 0.0006624893596897613,
|
| 21910 |
+
"loss": 2.0767,
|
| 21911 |
+
"num_input_tokens_seen": 64503259872,
|
| 21912 |
+
"step": 123050
|
| 21913 |
+
},
|
| 21914 |
+
{
|
| 21915 |
+
"epoch": 1.174382579868109,
|
| 21916 |
+
"grad_norm": 0.14180107414722443,
|
| 21917 |
+
"learning_rate": 0.0006598340745578908,
|
| 21918 |
+
"loss": 2.0611,
|
| 21919 |
+
"num_input_tokens_seen": 64529460896,
|
| 21920 |
+
"step": 123100
|
| 21921 |
+
},
|
| 21922 |
+
{
|
| 21923 |
+
"epoch": 1.174859582384297,
|
| 21924 |
+
"grad_norm": 0.14584094285964966,
|
| 21925 |
+
"learning_rate": 0.000657173759148761,
|
| 21926 |
+
"loss": 2.0693,
|
| 21927 |
+
"num_input_tokens_seen": 64555675296,
|
| 21928 |
+
"step": 123150
|
| 21929 |
+
},
|
| 21930 |
+
{
|
| 21931 |
+
"epoch": 1.1753365849004853,
|
| 21932 |
+
"grad_norm": 0.1269799768924713,
|
| 21933 |
+
"learning_rate": 0.0006545084971874737,
|
| 21934 |
+
"loss": 2.0615,
|
| 21935 |
+
"num_input_tokens_seen": 64581882720,
|
| 21936 |
+
"step": 123200
|
| 21937 |
+
},
|
| 21938 |
+
{
|
| 21939 |
+
"epoch": 1.1758135874166737,
|
| 21940 |
+
"grad_norm": 0.15073458850383759,
|
| 21941 |
+
"learning_rate": 0.0006518383725548074,
|
| 21942 |
+
"loss": 2.083,
|
| 21943 |
+
"num_input_tokens_seen": 64608088736,
|
| 21944 |
+
"step": 123250
|
| 21945 |
+
},
|
| 21946 |
+
{
|
| 21947 |
+
"epoch": 1.176290589932862,
|
| 21948 |
+
"grad_norm": 0.12902715802192688,
|
| 21949 |
+
"learning_rate": 0.000649163469284578,
|
| 21950 |
+
"loss": 2.0579,
|
| 21951 |
+
"num_input_tokens_seen": 64634299936,
|
| 21952 |
+
"step": 123300
|
| 21953 |
+
},
|
| 21954 |
+
{
|
| 21955 |
+
"epoch": 1.1767675924490502,
|
| 21956 |
+
"grad_norm": 0.13666096329689026,
|
| 21957 |
+
"learning_rate": 0.0006464838715609945,
|
| 21958 |
+
"loss": 2.0673,
|
| 21959 |
+
"num_input_tokens_seen": 64660511904,
|
| 21960 |
+
"step": 123350
|
| 21961 |
+
},
|
| 21962 |
+
{
|
| 21963 |
+
"epoch": 1.1772445949652384,
|
| 21964 |
+
"grad_norm": 0.13477379083633423,
|
| 21965 |
+
"learning_rate": 0.0006437996637160086,
|
| 21966 |
+
"loss": 2.0752,
|
| 21967 |
+
"num_input_tokens_seen": 64686718272,
|
| 21968 |
+
"step": 123400
|
| 21969 |
+
},
|
| 21970 |
+
{
|
| 21971 |
+
"epoch": 1.1777215974814268,
|
| 21972 |
+
"grad_norm": 0.13596594333648682,
|
| 21973 |
+
"learning_rate": 0.0006411109302266615,
|
| 21974 |
+
"loss": 2.0606,
|
| 21975 |
+
"num_input_tokens_seen": 64712932256,
|
| 21976 |
+
"step": 123450
|
| 21977 |
+
},
|
| 21978 |
+
{
|
| 21979 |
+
"epoch": 1.178198599997615,
|
| 21980 |
+
"grad_norm": 0.1400011032819748,
|
| 21981 |
+
"learning_rate": 0.0006384177557124247,
|
| 21982 |
+
"loss": 2.066,
|
| 21983 |
+
"num_input_tokens_seen": 64739145440,
|
| 21984 |
+
"step": 123500
|
| 21985 |
+
},
|
| 21986 |
+
{
|
| 21987 |
+
"epoch": 1.178198599997615,
|
| 21988 |
+
"eval_loss": 1.986546516418457,
|
| 21989 |
+
"eval_runtime": 82.7963,
|
| 21990 |
+
"eval_samples_per_second": 60.389,
|
| 21991 |
+
"eval_steps_per_second": 15.097,
|
| 21992 |
+
"num_input_tokens_seen": 64739145440,
|
| 21993 |
+
"step": 123500
|
| 21994 |
+
},
|
| 21995 |
+
{
|
| 21996 |
+
"epoch": 1.1786756025138032,
|
| 21997 |
+
"grad_norm": 0.13023069500923157,
|
| 21998 |
+
"learning_rate": 0.0006357202249325371,
|
| 21999 |
+
"loss": 2.0727,
|
| 22000 |
+
"num_input_tokens_seen": 64765359840,
|
| 22001 |
+
"step": 123550
|
| 22002 |
+
},
|
| 22003 |
+
{
|
| 22004 |
+
"epoch": 1.1791526050299916,
|
| 22005 |
+
"grad_norm": 0.13744056224822998,
|
| 22006 |
+
"learning_rate": 0.0006330184227833376,
|
| 22007 |
+
"loss": 2.0603,
|
| 22008 |
+
"num_input_tokens_seen": 64791573504,
|
| 22009 |
+
"step": 123600
|
| 22010 |
+
},
|
| 22011 |
+
{
|
| 22012 |
+
"epoch": 1.1796296075461798,
|
| 22013 |
+
"grad_norm": 0.1399419903755188,
|
| 22014 |
+
"learning_rate": 0.0006303124342955927,
|
| 22015 |
+
"loss": 2.0699,
|
| 22016 |
+
"num_input_tokens_seen": 64817787904,
|
| 22017 |
+
"step": 123650
|
| 22018 |
+
},
|
| 22019 |
+
{
|
| 22020 |
+
"epoch": 1.180106610062368,
|
| 22021 |
+
"grad_norm": 0.13453304767608643,
|
| 22022 |
+
"learning_rate": 0.0006276023446318213,
|
| 22023 |
+
"loss": 2.0764,
|
| 22024 |
+
"num_input_tokens_seen": 64844002304,
|
| 22025 |
+
"step": 123700
|
| 22026 |
+
},
|
| 22027 |
+
{
|
| 22028 |
+
"epoch": 1.1805836125785563,
|
| 22029 |
+
"grad_norm": 0.13495005667209625,
|
| 22030 |
+
"learning_rate": 0.0006248882390836135,
|
| 22031 |
+
"loss": 2.0629,
|
| 22032 |
+
"num_input_tokens_seen": 64870216704,
|
| 22033 |
+
"step": 123750
|
| 22034 |
+
},
|
| 22035 |
+
{
|
| 22036 |
+
"epoch": 1.1810606150947447,
|
| 22037 |
+
"grad_norm": 0.14330346882343292,
|
| 22038 |
+
"learning_rate": 0.000622170203068947,
|
| 22039 |
+
"loss": 2.0677,
|
| 22040 |
+
"num_input_tokens_seen": 64896426784,
|
| 22041 |
+
"step": 123800
|
| 22042 |
+
},
|
| 22043 |
+
{
|
| 22044 |
+
"epoch": 1.181537617610933,
|
| 22045 |
+
"grad_norm": 0.13179130852222443,
|
| 22046 |
+
"learning_rate": 0.0006194483221294988,
|
| 22047 |
+
"loss": 2.0568,
|
| 22048 |
+
"num_input_tokens_seen": 64922636000,
|
| 22049 |
+
"step": 123850
|
| 22050 |
+
},
|
| 22051 |
+
{
|
| 22052 |
+
"epoch": 1.182014620127121,
|
| 22053 |
+
"grad_norm": 0.12518762052059174,
|
| 22054 |
+
"learning_rate": 0.0006167226819279528,
|
| 22055 |
+
"loss": 2.0604,
|
| 22056 |
+
"num_input_tokens_seen": 64948840416,
|
| 22057 |
+
"step": 123900
|
| 22058 |
+
},
|
| 22059 |
+
{
|
| 22060 |
+
"epoch": 1.1824916226433095,
|
| 22061 |
+
"grad_norm": 0.12823528051376343,
|
| 22062 |
+
"learning_rate": 0.0006139933682453035,
|
| 22063 |
+
"loss": 2.0683,
|
| 22064 |
+
"num_input_tokens_seen": 64975054816,
|
| 22065 |
+
"step": 123950
|
| 22066 |
+
},
|
| 22067 |
+
{
|
| 22068 |
+
"epoch": 1.1829686251594977,
|
| 22069 |
+
"grad_norm": 0.1308305859565735,
|
| 22070 |
+
"learning_rate": 0.0006112604669781572,
|
| 22071 |
+
"loss": 2.0639,
|
| 22072 |
+
"num_input_tokens_seen": 65001257824,
|
| 22073 |
+
"step": 124000
|
| 22074 |
+
},
|
| 22075 |
+
{
|
| 22076 |
+
"epoch": 1.1829686251594977,
|
| 22077 |
+
"eval_loss": 1.9843353033065796,
|
| 22078 |
+
"eval_runtime": 82.7751,
|
| 22079 |
+
"eval_samples_per_second": 60.405,
|
| 22080 |
+
"eval_steps_per_second": 15.101,
|
| 22081 |
+
"num_input_tokens_seen": 65001257824,
|
| 22082 |
+
"step": 124000
|
| 22083 |
}
|
| 22084 |
],
|
| 22085 |
"logging_steps": 50,
|
| 22086 |
"max_steps": 140000,
|
| 22087 |
+
"num_input_tokens_seen": 65001257824,
|
| 22088 |
"num_train_epochs": 2,
|
| 22089 |
"save_steps": 1000,
|
| 22090 |
"stateful_callbacks": {
|
|
|
|
| 22099 |
"attributes": {}
|
| 22100 |
}
|
| 22101 |
},
|
| 22102 |
+
"total_flos": 1.150403701190529e+20,
|
| 22103 |
"train_batch_size": 32,
|
| 22104 |
"trial_name": null,
|
| 22105 |
"trial_params": null
|