Training in progress, step 35000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 517931840
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:39e47431790297c8d1ac0d590138e540ff35b008c08f15b4fec92555b68b3ca0
|
| 3 |
size 517931840
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1035661434
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0cb4360f6e3ef0a4db7ef43d5c8060cb784d63688538fb77fe4f179313685acd
|
| 3 |
size 1035661434
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3505914cea5cefe31834749326fbe845962aa02c10480cbc9f90524db4d28f1f
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c432826b41d4d9850a94ad79c80845280b64911bf27c831beef66a783066385f
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -6060,11 +6060,189 @@
|
|
| 6060 |
"eval_steps_per_second": 18.763,
|
| 6061 |
"num_input_tokens_seen": 35651580160,
|
| 6062 |
"step": 34000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6063 |
}
|
| 6064 |
],
|
| 6065 |
"logging_steps": 50,
|
| 6066 |
"max_steps": 200000,
|
| 6067 |
-
"num_input_tokens_seen":
|
| 6068 |
"num_train_epochs": 5,
|
| 6069 |
"save_steps": 1000,
|
| 6070 |
"stateful_callbacks": {
|
|
@@ -6079,7 +6257,7 @@
|
|
| 6079 |
"attributes": {}
|
| 6080 |
}
|
| 6081 |
},
|
| 6082 |
-
"total_flos": 2.
|
| 6083 |
"train_batch_size": 64,
|
| 6084 |
"trial_name": null,
|
| 6085 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.7688115135015657,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 35000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 6060 |
"eval_steps_per_second": 18.763,
|
| 6061 |
"num_input_tokens_seen": 35651580160,
|
| 6062 |
"step": 34000
|
| 6063 |
+
},
|
| 6064 |
+
{
|
| 6065 |
+
"epoch": 0.747943772420809,
|
| 6066 |
+
"grad_norm": 0.1586858183145523,
|
| 6067 |
+
"learning_rate": 0.001,
|
| 6068 |
+
"loss": 2.6654,
|
| 6069 |
+
"num_input_tokens_seen": 35704008960,
|
| 6070 |
+
"step": 34050
|
| 6071 |
+
},
|
| 6072 |
+
{
|
| 6073 |
+
"epoch": 0.749042074582954,
|
| 6074 |
+
"grad_norm": 0.1376073956489563,
|
| 6075 |
+
"learning_rate": 0.001,
|
| 6076 |
+
"loss": 2.6627,
|
| 6077 |
+
"num_input_tokens_seen": 35756437760,
|
| 6078 |
+
"step": 34100
|
| 6079 |
+
},
|
| 6080 |
+
{
|
| 6081 |
+
"epoch": 0.7501403767450991,
|
| 6082 |
+
"grad_norm": 0.13904818892478943,
|
| 6083 |
+
"learning_rate": 0.001,
|
| 6084 |
+
"loss": 2.6605,
|
| 6085 |
+
"num_input_tokens_seen": 35808866560,
|
| 6086 |
+
"step": 34150
|
| 6087 |
+
},
|
| 6088 |
+
{
|
| 6089 |
+
"epoch": 0.7512386789072443,
|
| 6090 |
+
"grad_norm": 0.14543947577476501,
|
| 6091 |
+
"learning_rate": 0.001,
|
| 6092 |
+
"loss": 2.6589,
|
| 6093 |
+
"num_input_tokens_seen": 35861295360,
|
| 6094 |
+
"step": 34200
|
| 6095 |
+
},
|
| 6096 |
+
{
|
| 6097 |
+
"epoch": 0.7523369810693894,
|
| 6098 |
+
"grad_norm": 0.14855198562145233,
|
| 6099 |
+
"learning_rate": 0.001,
|
| 6100 |
+
"loss": 2.6612,
|
| 6101 |
+
"num_input_tokens_seen": 35913724160,
|
| 6102 |
+
"step": 34250
|
| 6103 |
+
},
|
| 6104 |
+
{
|
| 6105 |
+
"epoch": 0.7534352832315344,
|
| 6106 |
+
"grad_norm": 0.14492908120155334,
|
| 6107 |
+
"learning_rate": 0.001,
|
| 6108 |
+
"loss": 2.6561,
|
| 6109 |
+
"num_input_tokens_seen": 35966152960,
|
| 6110 |
+
"step": 34300
|
| 6111 |
+
},
|
| 6112 |
+
{
|
| 6113 |
+
"epoch": 0.7545335853936795,
|
| 6114 |
+
"grad_norm": 0.1388978660106659,
|
| 6115 |
+
"learning_rate": 0.001,
|
| 6116 |
+
"loss": 2.6551,
|
| 6117 |
+
"num_input_tokens_seen": 36018581760,
|
| 6118 |
+
"step": 34350
|
| 6119 |
+
},
|
| 6120 |
+
{
|
| 6121 |
+
"epoch": 0.7556318875558247,
|
| 6122 |
+
"grad_norm": 0.14582422375679016,
|
| 6123 |
+
"learning_rate": 0.001,
|
| 6124 |
+
"loss": 2.6521,
|
| 6125 |
+
"num_input_tokens_seen": 36071010560,
|
| 6126 |
+
"step": 34400
|
| 6127 |
+
},
|
| 6128 |
+
{
|
| 6129 |
+
"epoch": 0.7567301897179697,
|
| 6130 |
+
"grad_norm": 0.17488695681095123,
|
| 6131 |
+
"learning_rate": 0.001,
|
| 6132 |
+
"loss": 2.6516,
|
| 6133 |
+
"num_input_tokens_seen": 36123439360,
|
| 6134 |
+
"step": 34450
|
| 6135 |
+
},
|
| 6136 |
+
{
|
| 6137 |
+
"epoch": 0.7578284918801148,
|
| 6138 |
+
"grad_norm": 0.12302416563034058,
|
| 6139 |
+
"learning_rate": 0.001,
|
| 6140 |
+
"loss": 2.6617,
|
| 6141 |
+
"num_input_tokens_seen": 36175868160,
|
| 6142 |
+
"step": 34500
|
| 6143 |
+
},
|
| 6144 |
+
{
|
| 6145 |
+
"epoch": 0.7578284918801148,
|
| 6146 |
+
"eval_loss": 2.5549991130828857,
|
| 6147 |
+
"eval_runtime": 67.5095,
|
| 6148 |
+
"eval_samples_per_second": 74.064,
|
| 6149 |
+
"eval_steps_per_second": 18.516,
|
| 6150 |
+
"num_input_tokens_seen": 36175868160,
|
| 6151 |
+
"step": 34500
|
| 6152 |
+
},
|
| 6153 |
+
{
|
| 6154 |
+
"epoch": 0.7589267940422599,
|
| 6155 |
+
"grad_norm": 0.14238396286964417,
|
| 6156 |
+
"learning_rate": 0.001,
|
| 6157 |
+
"loss": 2.6609,
|
| 6158 |
+
"num_input_tokens_seen": 36228296960,
|
| 6159 |
+
"step": 34550
|
| 6160 |
+
},
|
| 6161 |
+
{
|
| 6162 |
+
"epoch": 0.7600250962044051,
|
| 6163 |
+
"grad_norm": 0.17919403314590454,
|
| 6164 |
+
"learning_rate": 0.001,
|
| 6165 |
+
"loss": 2.6621,
|
| 6166 |
+
"num_input_tokens_seen": 36280725760,
|
| 6167 |
+
"step": 34600
|
| 6168 |
+
},
|
| 6169 |
+
{
|
| 6170 |
+
"epoch": 0.7611233983665501,
|
| 6171 |
+
"grad_norm": 0.13188666105270386,
|
| 6172 |
+
"learning_rate": 0.001,
|
| 6173 |
+
"loss": 2.6529,
|
| 6174 |
+
"num_input_tokens_seen": 36333154560,
|
| 6175 |
+
"step": 34650
|
| 6176 |
+
},
|
| 6177 |
+
{
|
| 6178 |
+
"epoch": 0.7622217005286952,
|
| 6179 |
+
"grad_norm": 0.16191646456718445,
|
| 6180 |
+
"learning_rate": 0.001,
|
| 6181 |
+
"loss": 2.6584,
|
| 6182 |
+
"num_input_tokens_seen": 36385583360,
|
| 6183 |
+
"step": 34700
|
| 6184 |
+
},
|
| 6185 |
+
{
|
| 6186 |
+
"epoch": 0.7633200026908403,
|
| 6187 |
+
"grad_norm": 0.14606165885925293,
|
| 6188 |
+
"learning_rate": 0.001,
|
| 6189 |
+
"loss": 2.6567,
|
| 6190 |
+
"num_input_tokens_seen": 36438012160,
|
| 6191 |
+
"step": 34750
|
| 6192 |
+
},
|
| 6193 |
+
{
|
| 6194 |
+
"epoch": 0.7644183048529853,
|
| 6195 |
+
"grad_norm": 0.1648443192243576,
|
| 6196 |
+
"learning_rate": 0.001,
|
| 6197 |
+
"loss": 2.6587,
|
| 6198 |
+
"num_input_tokens_seen": 36490440960,
|
| 6199 |
+
"step": 34800
|
| 6200 |
+
},
|
| 6201 |
+
{
|
| 6202 |
+
"epoch": 0.7655166070151305,
|
| 6203 |
+
"grad_norm": 0.19523674249649048,
|
| 6204 |
+
"learning_rate": 0.001,
|
| 6205 |
+
"loss": 2.6662,
|
| 6206 |
+
"num_input_tokens_seen": 36542869760,
|
| 6207 |
+
"step": 34850
|
| 6208 |
+
},
|
| 6209 |
+
{
|
| 6210 |
+
"epoch": 0.7666149091772756,
|
| 6211 |
+
"grad_norm": 0.1713179498910904,
|
| 6212 |
+
"learning_rate": 0.001,
|
| 6213 |
+
"loss": 2.6683,
|
| 6214 |
+
"num_input_tokens_seen": 36595298560,
|
| 6215 |
+
"step": 34900
|
| 6216 |
+
},
|
| 6217 |
+
{
|
| 6218 |
+
"epoch": 0.7677132113394207,
|
| 6219 |
+
"grad_norm": 0.14923711121082306,
|
| 6220 |
+
"learning_rate": 0.001,
|
| 6221 |
+
"loss": 2.6629,
|
| 6222 |
+
"num_input_tokens_seen": 36647727360,
|
| 6223 |
+
"step": 34950
|
| 6224 |
+
},
|
| 6225 |
+
{
|
| 6226 |
+
"epoch": 0.7688115135015657,
|
| 6227 |
+
"grad_norm": 0.13948023319244385,
|
| 6228 |
+
"learning_rate": 0.001,
|
| 6229 |
+
"loss": 2.6619,
|
| 6230 |
+
"num_input_tokens_seen": 36700156160,
|
| 6231 |
+
"step": 35000
|
| 6232 |
+
},
|
| 6233 |
+
{
|
| 6234 |
+
"epoch": 0.7688115135015657,
|
| 6235 |
+
"eval_loss": 2.5569379329681396,
|
| 6236 |
+
"eval_runtime": 67.9393,
|
| 6237 |
+
"eval_samples_per_second": 73.595,
|
| 6238 |
+
"eval_steps_per_second": 18.399,
|
| 6239 |
+
"num_input_tokens_seen": 36700156160,
|
| 6240 |
+
"step": 35000
|
| 6241 |
}
|
| 6242 |
],
|
| 6243 |
"logging_steps": 50,
|
| 6244 |
"max_steps": 200000,
|
| 6245 |
+
"num_input_tokens_seen": 36700156160,
|
| 6246 |
"num_train_epochs": 5,
|
| 6247 |
"save_steps": 1000,
|
| 6248 |
"stateful_callbacks": {
|
|
|
|
| 6257 |
"attributes": {}
|
| 6258 |
}
|
| 6259 |
},
|
| 6260 |
+
"total_flos": 2.090100787186434e+19,
|
| 6261 |
"train_batch_size": 64,
|
| 6262 |
"trial_name": null,
|
| 6263 |
"trial_params": null
|