Training in progress, step 36000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 517931840
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:efae6f25b472bb0e65dad8b999f6e73025004f81855c74ec54fc8ecdd3a25a3d
|
| 3 |
size 517931840
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1035661434
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bcd1e62e2d3104319cb00e159562f9ab40349a35045ca52ca467e6336a9d4925
|
| 3 |
size 1035661434
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3105f55ffa4117a580fe7ec380b19db2b68da0c57679e9557361f205c3d7ca03
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5a2a9a97378c1b7631d78a28de277749231ad65f077c045df73323c2c2b85da
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -6238,11 +6238,189 @@
|
|
| 6238 |
"eval_steps_per_second": 18.399,
|
| 6239 |
"num_input_tokens_seen": 36700156160,
|
| 6240 |
"step": 35000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6241 |
}
|
| 6242 |
],
|
| 6243 |
"logging_steps": 50,
|
| 6244 |
"max_steps": 200000,
|
| 6245 |
-
"num_input_tokens_seen":
|
| 6246 |
"num_train_epochs": 5,
|
| 6247 |
"save_steps": 1000,
|
| 6248 |
"stateful_callbacks": {
|
|
@@ -6257,7 +6435,7 @@
|
|
| 6257 |
"attributes": {}
|
| 6258 |
}
|
| 6259 |
},
|
| 6260 |
-
"total_flos": 2.
|
| 6261 |
"train_batch_size": 64,
|
| 6262 |
"trial_name": null,
|
| 6263 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.7907775567444676,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 36000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 6238 |
"eval_steps_per_second": 18.399,
|
| 6239 |
"num_input_tokens_seen": 36700156160,
|
| 6240 |
"step": 35000
|
| 6241 |
+
},
|
| 6242 |
+
{
|
| 6243 |
+
"epoch": 0.7699098156637109,
|
| 6244 |
+
"grad_norm": 0.14624406397342682,
|
| 6245 |
+
"learning_rate": 0.001,
|
| 6246 |
+
"loss": 2.657,
|
| 6247 |
+
"num_input_tokens_seen": 36752584960,
|
| 6248 |
+
"step": 35050
|
| 6249 |
+
},
|
| 6250 |
+
{
|
| 6251 |
+
"epoch": 0.771008117825856,
|
| 6252 |
+
"grad_norm": 0.16855786740779877,
|
| 6253 |
+
"learning_rate": 0.001,
|
| 6254 |
+
"loss": 2.6585,
|
| 6255 |
+
"num_input_tokens_seen": 36805013760,
|
| 6256 |
+
"step": 35100
|
| 6257 |
+
},
|
| 6258 |
+
{
|
| 6259 |
+
"epoch": 0.772106419988001,
|
| 6260 |
+
"grad_norm": 0.1439932882785797,
|
| 6261 |
+
"learning_rate": 0.001,
|
| 6262 |
+
"loss": 2.6653,
|
| 6263 |
+
"num_input_tokens_seen": 36857442560,
|
| 6264 |
+
"step": 35150
|
| 6265 |
+
},
|
| 6266 |
+
{
|
| 6267 |
+
"epoch": 0.7732047221501461,
|
| 6268 |
+
"grad_norm": 0.16299331188201904,
|
| 6269 |
+
"learning_rate": 0.001,
|
| 6270 |
+
"loss": 2.6621,
|
| 6271 |
+
"num_input_tokens_seen": 36909871360,
|
| 6272 |
+
"step": 35200
|
| 6273 |
+
},
|
| 6274 |
+
{
|
| 6275 |
+
"epoch": 0.7743030243122913,
|
| 6276 |
+
"grad_norm": 0.16961826384067535,
|
| 6277 |
+
"learning_rate": 0.001,
|
| 6278 |
+
"loss": 2.6545,
|
| 6279 |
+
"num_input_tokens_seen": 36962300160,
|
| 6280 |
+
"step": 35250
|
| 6281 |
+
},
|
| 6282 |
+
{
|
| 6283 |
+
"epoch": 0.7754013264744364,
|
| 6284 |
+
"grad_norm": 0.13337954878807068,
|
| 6285 |
+
"learning_rate": 0.001,
|
| 6286 |
+
"loss": 2.652,
|
| 6287 |
+
"num_input_tokens_seen": 37014728960,
|
| 6288 |
+
"step": 35300
|
| 6289 |
+
},
|
| 6290 |
+
{
|
| 6291 |
+
"epoch": 0.7764996286365814,
|
| 6292 |
+
"grad_norm": 0.1728074699640274,
|
| 6293 |
+
"learning_rate": 0.001,
|
| 6294 |
+
"loss": 2.6631,
|
| 6295 |
+
"num_input_tokens_seen": 37067157760,
|
| 6296 |
+
"step": 35350
|
| 6297 |
+
},
|
| 6298 |
+
{
|
| 6299 |
+
"epoch": 0.7775979307987265,
|
| 6300 |
+
"grad_norm": 0.16615192592144012,
|
| 6301 |
+
"learning_rate": 0.001,
|
| 6302 |
+
"loss": 2.6551,
|
| 6303 |
+
"num_input_tokens_seen": 37119586560,
|
| 6304 |
+
"step": 35400
|
| 6305 |
+
},
|
| 6306 |
+
{
|
| 6307 |
+
"epoch": 0.7786962329608716,
|
| 6308 |
+
"grad_norm": 0.1515650749206543,
|
| 6309 |
+
"learning_rate": 0.001,
|
| 6310 |
+
"loss": 2.6529,
|
| 6311 |
+
"num_input_tokens_seen": 37172015360,
|
| 6312 |
+
"step": 35450
|
| 6313 |
+
},
|
| 6314 |
+
{
|
| 6315 |
+
"epoch": 0.7797945351230167,
|
| 6316 |
+
"grad_norm": 0.1534053236246109,
|
| 6317 |
+
"learning_rate": 0.001,
|
| 6318 |
+
"loss": 2.6567,
|
| 6319 |
+
"num_input_tokens_seen": 37224444160,
|
| 6320 |
+
"step": 35500
|
| 6321 |
+
},
|
| 6322 |
+
{
|
| 6323 |
+
"epoch": 0.7797945351230167,
|
| 6324 |
+
"eval_loss": 2.55454683303833,
|
| 6325 |
+
"eval_runtime": 67.0727,
|
| 6326 |
+
"eval_samples_per_second": 74.546,
|
| 6327 |
+
"eval_steps_per_second": 18.637,
|
| 6328 |
+
"num_input_tokens_seen": 37224444160,
|
| 6329 |
+
"step": 35500
|
| 6330 |
+
},
|
| 6331 |
+
{
|
| 6332 |
+
"epoch": 0.7808928372851618,
|
| 6333 |
+
"grad_norm": 0.16377541422843933,
|
| 6334 |
+
"learning_rate": 0.001,
|
| 6335 |
+
"loss": 2.6552,
|
| 6336 |
+
"num_input_tokens_seen": 37276872960,
|
| 6337 |
+
"step": 35550
|
| 6338 |
+
},
|
| 6339 |
+
{
|
| 6340 |
+
"epoch": 0.7819911394473069,
|
| 6341 |
+
"grad_norm": 0.14807477593421936,
|
| 6342 |
+
"learning_rate": 0.001,
|
| 6343 |
+
"loss": 2.6563,
|
| 6344 |
+
"num_input_tokens_seen": 37329301760,
|
| 6345 |
+
"step": 35600
|
| 6346 |
+
},
|
| 6347 |
+
{
|
| 6348 |
+
"epoch": 0.783089441609452,
|
| 6349 |
+
"grad_norm": 0.13599660992622375,
|
| 6350 |
+
"learning_rate": 0.001,
|
| 6351 |
+
"loss": 2.6575,
|
| 6352 |
+
"num_input_tokens_seen": 37381730560,
|
| 6353 |
+
"step": 35650
|
| 6354 |
+
},
|
| 6355 |
+
{
|
| 6356 |
+
"epoch": 0.7841877437715971,
|
| 6357 |
+
"grad_norm": 0.16653482615947723,
|
| 6358 |
+
"learning_rate": 0.001,
|
| 6359 |
+
"loss": 2.6515,
|
| 6360 |
+
"num_input_tokens_seen": 37434159360,
|
| 6361 |
+
"step": 35700
|
| 6362 |
+
},
|
| 6363 |
+
{
|
| 6364 |
+
"epoch": 0.7852860459337422,
|
| 6365 |
+
"grad_norm": 0.15467293560504913,
|
| 6366 |
+
"learning_rate": 0.001,
|
| 6367 |
+
"loss": 2.6548,
|
| 6368 |
+
"num_input_tokens_seen": 37486588160,
|
| 6369 |
+
"step": 35750
|
| 6370 |
+
},
|
| 6371 |
+
{
|
| 6372 |
+
"epoch": 0.7863843480958873,
|
| 6373 |
+
"grad_norm": 0.4751467704772949,
|
| 6374 |
+
"learning_rate": 0.001,
|
| 6375 |
+
"loss": 2.6592,
|
| 6376 |
+
"num_input_tokens_seen": 37539016960,
|
| 6377 |
+
"step": 35800
|
| 6378 |
+
},
|
| 6379 |
+
{
|
| 6380 |
+
"epoch": 0.7874826502580323,
|
| 6381 |
+
"grad_norm": 0.15940867364406586,
|
| 6382 |
+
"learning_rate": 0.001,
|
| 6383 |
+
"loss": 2.6624,
|
| 6384 |
+
"num_input_tokens_seen": 37591445760,
|
| 6385 |
+
"step": 35850
|
| 6386 |
+
},
|
| 6387 |
+
{
|
| 6388 |
+
"epoch": 0.7885809524201775,
|
| 6389 |
+
"grad_norm": 0.137634739279747,
|
| 6390 |
+
"learning_rate": 0.001,
|
| 6391 |
+
"loss": 2.6559,
|
| 6392 |
+
"num_input_tokens_seen": 37643874560,
|
| 6393 |
+
"step": 35900
|
| 6394 |
+
},
|
| 6395 |
+
{
|
| 6396 |
+
"epoch": 0.7896792545823226,
|
| 6397 |
+
"grad_norm": 0.16022460162639618,
|
| 6398 |
+
"learning_rate": 0.001,
|
| 6399 |
+
"loss": 2.6555,
|
| 6400 |
+
"num_input_tokens_seen": 37696303360,
|
| 6401 |
+
"step": 35950
|
| 6402 |
+
},
|
| 6403 |
+
{
|
| 6404 |
+
"epoch": 0.7907775567444676,
|
| 6405 |
+
"grad_norm": 0.147109717130661,
|
| 6406 |
+
"learning_rate": 0.001,
|
| 6407 |
+
"loss": 2.663,
|
| 6408 |
+
"num_input_tokens_seen": 37748732160,
|
| 6409 |
+
"step": 36000
|
| 6410 |
+
},
|
| 6411 |
+
{
|
| 6412 |
+
"epoch": 0.7907775567444676,
|
| 6413 |
+
"eval_loss": 2.556107521057129,
|
| 6414 |
+
"eval_runtime": 67.1814,
|
| 6415 |
+
"eval_samples_per_second": 74.425,
|
| 6416 |
+
"eval_steps_per_second": 18.606,
|
| 6417 |
+
"num_input_tokens_seen": 37748732160,
|
| 6418 |
+
"step": 36000
|
| 6419 |
}
|
| 6420 |
],
|
| 6421 |
"logging_steps": 50,
|
| 6422 |
"max_steps": 200000,
|
| 6423 |
+
"num_input_tokens_seen": 37748732160,
|
| 6424 |
"num_train_epochs": 5,
|
| 6425 |
"save_steps": 1000,
|
| 6426 |
"stateful_callbacks": {
|
|
|
|
| 6435 |
"attributes": {}
|
| 6436 |
}
|
| 6437 |
},
|
| 6438 |
+
"total_flos": 2.149817958782927e+19,
|
| 6439 |
"train_batch_size": 64,
|
| 6440 |
"trial_name": null,
|
| 6441 |
"trial_params": null
|