Training in progress, step 131000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:18f6247fa697227171786e92b63492b81203ba9ab620eea2a35269c2dc5abc91
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a79b728b1351b728e46db09ab4e3bda84220fcf605f8e84a1af65a7e98ccf401
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:13afedcbea29e4911157dfdebca89adaca3015ec55fbe8952619bfb77f49f98b
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d530307a60624b67b44a38452390579f46394dc6c46c3e7e0b33446906fdcfb9
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -23148,11 +23148,189 @@
|
|
| 23148 |
"eval_steps_per_second": 15.045,
|
| 23149 |
"num_input_tokens_seen": 68146442176,
|
| 23150 |
"step": 130000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23151 |
}
|
| 23152 |
],
|
| 23153 |
"logging_steps": 50,
|
| 23154 |
"max_steps": 140000,
|
| 23155 |
-
"num_input_tokens_seen":
|
| 23156 |
"num_train_epochs": 2,
|
| 23157 |
"save_steps": 1000,
|
| 23158 |
"stateful_callbacks": {
|
|
@@ -23167,7 +23345,7 @@
|
|
| 23167 |
"attributes": {}
|
| 23168 |
}
|
| 23169 |
},
|
| 23170 |
-
"total_flos": 1.
|
| 23171 |
"train_batch_size": 32,
|
| 23172 |
"trial_name": null,
|
| 23173 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.249748977425856,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 131000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 23148 |
"eval_steps_per_second": 15.045,
|
| 23149 |
"num_input_tokens_seen": 68146442176,
|
| 23150 |
"step": 130000
|
| 23151 |
+
},
|
| 23152 |
+
{
|
| 23153 |
+
"epoch": 1.2406859296182788,
|
| 23154 |
+
"grad_norm": 0.12637196481227875,
|
| 23155 |
+
"learning_rate": 0.00028053434571568983,
|
| 23156 |
+
"loss": 2.0543,
|
| 23157 |
+
"num_input_tokens_seen": 68172655040,
|
| 23158 |
+
"step": 130050
|
| 23159 |
+
},
|
| 23160 |
+
{
|
| 23161 |
+
"epoch": 1.241162932134467,
|
| 23162 |
+
"grad_norm": 0.1351892203092575,
|
| 23163 |
+
"learning_rate": 0.000278017467984759,
|
| 23164 |
+
"loss": 2.0578,
|
| 23165 |
+
"num_input_tokens_seen": 68198869440,
|
| 23166 |
+
"step": 130100
|
| 23167 |
+
},
|
| 23168 |
+
{
|
| 23169 |
+
"epoch": 1.2416399346506553,
|
| 23170 |
+
"grad_norm": 0.12203965336084366,
|
| 23171 |
+
"learning_rate": 0.00027550757645927764,
|
| 23172 |
+
"loss": 2.0427,
|
| 23173 |
+
"num_input_tokens_seen": 68225083840,
|
| 23174 |
+
"step": 130150
|
| 23175 |
+
},
|
| 23176 |
+
{
|
| 23177 |
+
"epoch": 1.2421169371668435,
|
| 23178 |
+
"grad_norm": 0.13395994901657104,
|
| 23179 |
+
"learning_rate": 0.00027300475013022663,
|
| 23180 |
+
"loss": 2.0488,
|
| 23181 |
+
"num_input_tokens_seen": 68251293952,
|
| 23182 |
+
"step": 130200
|
| 23183 |
+
},
|
| 23184 |
+
{
|
| 23185 |
+
"epoch": 1.242593939683032,
|
| 23186 |
+
"grad_norm": 0.1291465014219284,
|
| 23187 |
+
"learning_rate": 0.0002705090677662311,
|
| 23188 |
+
"loss": 2.0484,
|
| 23189 |
+
"num_input_tokens_seen": 68277498432,
|
| 23190 |
+
"step": 130250
|
| 23191 |
+
},
|
| 23192 |
+
{
|
| 23193 |
+
"epoch": 1.24307094219922,
|
| 23194 |
+
"grad_norm": 0.12472834438085556,
|
| 23195 |
+
"learning_rate": 0.000268020607911083,
|
| 23196 |
+
"loss": 2.0538,
|
| 23197 |
+
"num_input_tokens_seen": 68303709440,
|
| 23198 |
+
"step": 130300
|
| 23199 |
+
},
|
| 23200 |
+
{
|
| 23201 |
+
"epoch": 1.2435479447154083,
|
| 23202 |
+
"grad_norm": 0.1263572871685028,
|
| 23203 |
+
"learning_rate": 0.0002655394488812677,
|
| 23204 |
+
"loss": 2.0487,
|
| 23205 |
+
"num_input_tokens_seen": 68329920512,
|
| 23206 |
+
"step": 130350
|
| 23207 |
+
},
|
| 23208 |
+
{
|
| 23209 |
+
"epoch": 1.2440249472315967,
|
| 23210 |
+
"grad_norm": 0.12614773213863373,
|
| 23211 |
+
"learning_rate": 0.0002630656687635007,
|
| 23212 |
+
"loss": 2.053,
|
| 23213 |
+
"num_input_tokens_seen": 68356112384,
|
| 23214 |
+
"step": 130400
|
| 23215 |
+
},
|
| 23216 |
+
{
|
| 23217 |
+
"epoch": 1.244501949747785,
|
| 23218 |
+
"grad_norm": 0.1241307333111763,
|
| 23219 |
+
"learning_rate": 0.0002605993454122687,
|
| 23220 |
+
"loss": 2.049,
|
| 23221 |
+
"num_input_tokens_seen": 68382320896,
|
| 23222 |
+
"step": 130450
|
| 23223 |
+
},
|
| 23224 |
+
{
|
| 23225 |
+
"epoch": 1.2449789522639731,
|
| 23226 |
+
"grad_norm": 0.12764516472816467,
|
| 23227 |
+
"learning_rate": 0.0002581405564473801,
|
| 23228 |
+
"loss": 2.0338,
|
| 23229 |
+
"num_input_tokens_seen": 68408534464,
|
| 23230 |
+
"step": 130500
|
| 23231 |
+
},
|
| 23232 |
+
{
|
| 23233 |
+
"epoch": 1.2449789522639731,
|
| 23234 |
+
"eval_loss": 1.9643968343734741,
|
| 23235 |
+
"eval_runtime": 82.7385,
|
| 23236 |
+
"eval_samples_per_second": 60.431,
|
| 23237 |
+
"eval_steps_per_second": 15.108,
|
| 23238 |
+
"num_input_tokens_seen": 68408534464,
|
| 23239 |
+
"step": 130500
|
| 23240 |
+
},
|
| 23241 |
+
{
|
| 23242 |
+
"epoch": 1.2454559547801614,
|
| 23243 |
+
"grad_norm": 0.1308233141899109,
|
| 23244 |
+
"learning_rate": 0.0002556893792515227,
|
| 23245 |
+
"loss": 2.0371,
|
| 23246 |
+
"num_input_tokens_seen": 68434747040,
|
| 23247 |
+
"step": 130550
|
| 23248 |
+
},
|
| 23249 |
+
{
|
| 23250 |
+
"epoch": 1.2459329572963498,
|
| 23251 |
+
"grad_norm": 0.12745235860347748,
|
| 23252 |
+
"learning_rate": 0.00025324589096782657,
|
| 23253 |
+
"loss": 2.0373,
|
| 23254 |
+
"num_input_tokens_seen": 68460951616,
|
| 23255 |
+
"step": 130600
|
| 23256 |
+
},
|
| 23257 |
+
{
|
| 23258 |
+
"epoch": 1.246409959812538,
|
| 23259 |
+
"grad_norm": 0.1278812736272812,
|
| 23260 |
+
"learning_rate": 0.0002508101684974387,
|
| 23261 |
+
"loss": 2.0405,
|
| 23262 |
+
"num_input_tokens_seen": 68487165696,
|
| 23263 |
+
"step": 130650
|
| 23264 |
+
},
|
| 23265 |
+
{
|
| 23266 |
+
"epoch": 1.2468869623287262,
|
| 23267 |
+
"grad_norm": 0.12204719334840775,
|
| 23268 |
+
"learning_rate": 0.00024838228849709997,
|
| 23269 |
+
"loss": 2.0424,
|
| 23270 |
+
"num_input_tokens_seen": 68513380096,
|
| 23271 |
+
"step": 130700
|
| 23272 |
+
},
|
| 23273 |
+
{
|
| 23274 |
+
"epoch": 1.2473639648449146,
|
| 23275 |
+
"grad_norm": 0.11976956576108932,
|
| 23276 |
+
"learning_rate": 0.0002459623273767354,
|
| 23277 |
+
"loss": 2.0596,
|
| 23278 |
+
"num_input_tokens_seen": 68539590240,
|
| 23279 |
+
"step": 130750
|
| 23280 |
+
},
|
| 23281 |
+
{
|
| 23282 |
+
"epoch": 1.2478409673611028,
|
| 23283 |
+
"grad_norm": 0.13120809197425842,
|
| 23284 |
+
"learning_rate": 0.000243550361297047,
|
| 23285 |
+
"loss": 2.037,
|
| 23286 |
+
"num_input_tokens_seen": 68565804640,
|
| 23287 |
+
"step": 130800
|
| 23288 |
+
},
|
| 23289 |
+
{
|
| 23290 |
+
"epoch": 1.248317969877291,
|
| 23291 |
+
"grad_norm": 0.12905927002429962,
|
| 23292 |
+
"learning_rate": 0.00024114646616711844,
|
| 23293 |
+
"loss": 2.0341,
|
| 23294 |
+
"num_input_tokens_seen": 68592007552,
|
| 23295 |
+
"step": 130850
|
| 23296 |
+
},
|
| 23297 |
+
{
|
| 23298 |
+
"epoch": 1.2487949723934793,
|
| 23299 |
+
"grad_norm": 0.12697407603263855,
|
| 23300 |
+
"learning_rate": 0.00023875071764202561,
|
| 23301 |
+
"loss": 2.05,
|
| 23302 |
+
"num_input_tokens_seen": 68618221952,
|
| 23303 |
+
"step": 130900
|
| 23304 |
+
},
|
| 23305 |
+
{
|
| 23306 |
+
"epoch": 1.2492719749096677,
|
| 23307 |
+
"grad_norm": 0.12694934010505676,
|
| 23308 |
+
"learning_rate": 0.00023636319112045495,
|
| 23309 |
+
"loss": 2.0436,
|
| 23310 |
+
"num_input_tokens_seen": 68644425984,
|
| 23311 |
+
"step": 130950
|
| 23312 |
+
},
|
| 23313 |
+
{
|
| 23314 |
+
"epoch": 1.249748977425856,
|
| 23315 |
+
"grad_norm": 0.1360025703907013,
|
| 23316 |
+
"learning_rate": 0.00023398396174233177,
|
| 23317 |
+
"loss": 2.0506,
|
| 23318 |
+
"num_input_tokens_seen": 68670633664,
|
| 23319 |
+
"step": 131000
|
| 23320 |
+
},
|
| 23321 |
+
{
|
| 23322 |
+
"epoch": 1.249748977425856,
|
| 23323 |
+
"eval_loss": 1.962631106376648,
|
| 23324 |
+
"eval_runtime": 82.4327,
|
| 23325 |
+
"eval_samples_per_second": 60.656,
|
| 23326 |
+
"eval_steps_per_second": 15.164,
|
| 23327 |
+
"num_input_tokens_seen": 68670633664,
|
| 23328 |
+
"step": 131000
|
| 23329 |
}
|
| 23330 |
],
|
| 23331 |
"logging_steps": 50,
|
| 23332 |
"max_steps": 140000,
|
| 23333 |
+
"num_input_tokens_seen": 68670633664,
|
| 23334 |
"num_train_epochs": 2,
|
| 23335 |
"save_steps": 1000,
|
| 23336 |
"stateful_callbacks": {
|
|
|
|
| 23345 |
"attributes": {}
|
| 23346 |
}
|
| 23347 |
},
|
| 23348 |
+
"total_flos": 1.2153449606169969e+20,
|
| 23349 |
"train_batch_size": 32,
|
| 23350 |
"trial_name": null,
|
| 23351 |
"trial_params": null
|