Training in progress, step 54000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aa0a1572ea481edcf54695292d1afeb45339e9d2c4b988649cdfc68bb148006f
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88539436cc04b7ab674c4b703cc9d0b734fa709baeed11a8b4233a791dc8b00e
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e516d1931a63763a7fdfb84f01f54aaada25beb218520b62969ba08ff897cee4
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4b89459823d581d70469027e8df5427d5b9a07aadbd42c55eac43368b994e74e
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -9442,11 +9442,189 @@
|
|
| 9442 |
"eval_steps_per_second": 23.269,
|
| 9443 |
"num_input_tokens_seen": 13893627456,
|
| 9444 |
"step": 53000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9445 |
}
|
| 9446 |
],
|
| 9447 |
"logging_steps": 50,
|
| 9448 |
"max_steps": 70000,
|
| 9449 |
-
"num_input_tokens_seen":
|
| 9450 |
"num_train_epochs": 1,
|
| 9451 |
"save_steps": 1000,
|
| 9452 |
"stateful_callbacks": {
|
|
@@ -9461,7 +9639,7 @@
|
|
| 9461 |
"attributes": {}
|
| 9462 |
}
|
| 9463 |
},
|
| 9464 |
-
"total_flos": 3.
|
| 9465 |
"train_batch_size": 64,
|
| 9466 |
"trial_name": null,
|
| 9467 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.2575813587416674,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 54000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 9442 |
"eval_steps_per_second": 23.269,
|
| 9443 |
"num_input_tokens_seen": 13893627456,
|
| 9444 |
"step": 53000
|
| 9445 |
+
},
|
| 9446 |
+
{
|
| 9447 |
+
"epoch": 0.2530498348378788,
|
| 9448 |
+
"grad_norm": 0.20486177504062653,
|
| 9449 |
+
"learning_rate": 0.001,
|
| 9450 |
+
"loss": 2.5977,
|
| 9451 |
+
"num_input_tokens_seen": 13906734656,
|
| 9452 |
+
"step": 53050
|
| 9453 |
+
},
|
| 9454 |
+
{
|
| 9455 |
+
"epoch": 0.2532883360959729,
|
| 9456 |
+
"grad_norm": 0.18098385632038116,
|
| 9457 |
+
"learning_rate": 0.001,
|
| 9458 |
+
"loss": 2.5931,
|
| 9459 |
+
"num_input_tokens_seen": 13919841856,
|
| 9460 |
+
"step": 53100
|
| 9461 |
+
},
|
| 9462 |
+
{
|
| 9463 |
+
"epoch": 0.25352683735406706,
|
| 9464 |
+
"grad_norm": 0.1933833658695221,
|
| 9465 |
+
"learning_rate": 0.001,
|
| 9466 |
+
"loss": 2.6058,
|
| 9467 |
+
"num_input_tokens_seen": 13932949056,
|
| 9468 |
+
"step": 53150
|
| 9469 |
+
},
|
| 9470 |
+
{
|
| 9471 |
+
"epoch": 0.25376533861216116,
|
| 9472 |
+
"grad_norm": 0.29640141129493713,
|
| 9473 |
+
"learning_rate": 0.001,
|
| 9474 |
+
"loss": 2.5864,
|
| 9475 |
+
"num_input_tokens_seen": 13946056256,
|
| 9476 |
+
"step": 53200
|
| 9477 |
+
},
|
| 9478 |
+
{
|
| 9479 |
+
"epoch": 0.2540038398702553,
|
| 9480 |
+
"grad_norm": 0.2559553384780884,
|
| 9481 |
+
"learning_rate": 0.001,
|
| 9482 |
+
"loss": 2.6137,
|
| 9483 |
+
"num_input_tokens_seen": 13959163456,
|
| 9484 |
+
"step": 53250
|
| 9485 |
+
},
|
| 9486 |
+
{
|
| 9487 |
+
"epoch": 0.2542423411283494,
|
| 9488 |
+
"grad_norm": 0.21698619425296783,
|
| 9489 |
+
"learning_rate": 0.001,
|
| 9490 |
+
"loss": 2.6184,
|
| 9491 |
+
"num_input_tokens_seen": 13972270656,
|
| 9492 |
+
"step": 53300
|
| 9493 |
+
},
|
| 9494 |
+
{
|
| 9495 |
+
"epoch": 0.2544808423864436,
|
| 9496 |
+
"grad_norm": 0.19658173620700836,
|
| 9497 |
+
"learning_rate": 0.001,
|
| 9498 |
+
"loss": 2.5938,
|
| 9499 |
+
"num_input_tokens_seen": 13985377856,
|
| 9500 |
+
"step": 53350
|
| 9501 |
+
},
|
| 9502 |
+
{
|
| 9503 |
+
"epoch": 0.25471934364453774,
|
| 9504 |
+
"grad_norm": 0.2056342512369156,
|
| 9505 |
+
"learning_rate": 0.001,
|
| 9506 |
+
"loss": 2.5952,
|
| 9507 |
+
"num_input_tokens_seen": 13998485056,
|
| 9508 |
+
"step": 53400
|
| 9509 |
+
},
|
| 9510 |
+
{
|
| 9511 |
+
"epoch": 0.25495784490263185,
|
| 9512 |
+
"grad_norm": 0.1932424008846283,
|
| 9513 |
+
"learning_rate": 0.001,
|
| 9514 |
+
"loss": 2.6101,
|
| 9515 |
+
"num_input_tokens_seen": 14011592256,
|
| 9516 |
+
"step": 53450
|
| 9517 |
+
},
|
| 9518 |
+
{
|
| 9519 |
+
"epoch": 0.255196346160726,
|
| 9520 |
+
"grad_norm": 0.19347251951694489,
|
| 9521 |
+
"learning_rate": 0.001,
|
| 9522 |
+
"loss": 2.5976,
|
| 9523 |
+
"num_input_tokens_seen": 14024699456,
|
| 9524 |
+
"step": 53500
|
| 9525 |
+
},
|
| 9526 |
+
{
|
| 9527 |
+
"epoch": 0.255196346160726,
|
| 9528 |
+
"eval_loss": 2.4863245487213135,
|
| 9529 |
+
"eval_runtime": 53.2426,
|
| 9530 |
+
"eval_samples_per_second": 93.91,
|
| 9531 |
+
"eval_steps_per_second": 23.477,
|
| 9532 |
+
"num_input_tokens_seen": 14024699456,
|
| 9533 |
+
"step": 53500
|
| 9534 |
+
},
|
| 9535 |
+
{
|
| 9536 |
+
"epoch": 0.2554348474188201,
|
| 9537 |
+
"grad_norm": 0.1986820101737976,
|
| 9538 |
+
"learning_rate": 0.001,
|
| 9539 |
+
"loss": 2.6066,
|
| 9540 |
+
"num_input_tokens_seen": 14037806656,
|
| 9541 |
+
"step": 53550
|
| 9542 |
+
},
|
| 9543 |
+
{
|
| 9544 |
+
"epoch": 0.25567334867691427,
|
| 9545 |
+
"grad_norm": 0.21295565366744995,
|
| 9546 |
+
"learning_rate": 0.001,
|
| 9547 |
+
"loss": 2.6107,
|
| 9548 |
+
"num_input_tokens_seen": 14050913856,
|
| 9549 |
+
"step": 53600
|
| 9550 |
+
},
|
| 9551 |
+
{
|
| 9552 |
+
"epoch": 0.25591184993500843,
|
| 9553 |
+
"grad_norm": 0.21585114300251007,
|
| 9554 |
+
"learning_rate": 0.001,
|
| 9555 |
+
"loss": 2.6077,
|
| 9556 |
+
"num_input_tokens_seen": 14064021056,
|
| 9557 |
+
"step": 53650
|
| 9558 |
+
},
|
| 9559 |
+
{
|
| 9560 |
+
"epoch": 0.25615035119310253,
|
| 9561 |
+
"grad_norm": 0.19424305856227875,
|
| 9562 |
+
"learning_rate": 0.001,
|
| 9563 |
+
"loss": 2.5931,
|
| 9564 |
+
"num_input_tokens_seen": 14077128256,
|
| 9565 |
+
"step": 53700
|
| 9566 |
+
},
|
| 9567 |
+
{
|
| 9568 |
+
"epoch": 0.2563888524511967,
|
| 9569 |
+
"grad_norm": 0.20265349745750427,
|
| 9570 |
+
"learning_rate": 0.001,
|
| 9571 |
+
"loss": 2.5901,
|
| 9572 |
+
"num_input_tokens_seen": 14090235456,
|
| 9573 |
+
"step": 53750
|
| 9574 |
+
},
|
| 9575 |
+
{
|
| 9576 |
+
"epoch": 0.2566273537092908,
|
| 9577 |
+
"grad_norm": 1.037636160850525,
|
| 9578 |
+
"learning_rate": 0.001,
|
| 9579 |
+
"loss": 2.5775,
|
| 9580 |
+
"num_input_tokens_seen": 14103342656,
|
| 9581 |
+
"step": 53800
|
| 9582 |
+
},
|
| 9583 |
+
{
|
| 9584 |
+
"epoch": 0.25686585496738495,
|
| 9585 |
+
"grad_norm": 0.32030293345451355,
|
| 9586 |
+
"learning_rate": 0.001,
|
| 9587 |
+
"loss": 2.6242,
|
| 9588 |
+
"num_input_tokens_seen": 14116449856,
|
| 9589 |
+
"step": 53850
|
| 9590 |
+
},
|
| 9591 |
+
{
|
| 9592 |
+
"epoch": 0.2571043562254791,
|
| 9593 |
+
"grad_norm": 0.2339978665113449,
|
| 9594 |
+
"learning_rate": 0.001,
|
| 9595 |
+
"loss": 2.6122,
|
| 9596 |
+
"num_input_tokens_seen": 14129557056,
|
| 9597 |
+
"step": 53900
|
| 9598 |
+
},
|
| 9599 |
+
{
|
| 9600 |
+
"epoch": 0.2573428574835732,
|
| 9601 |
+
"grad_norm": 0.22179783880710602,
|
| 9602 |
+
"learning_rate": 0.001,
|
| 9603 |
+
"loss": 2.6025,
|
| 9604 |
+
"num_input_tokens_seen": 14142664256,
|
| 9605 |
+
"step": 53950
|
| 9606 |
+
},
|
| 9607 |
+
{
|
| 9608 |
+
"epoch": 0.2575813587416674,
|
| 9609 |
+
"grad_norm": 0.22616736590862274,
|
| 9610 |
+
"learning_rate": 0.001,
|
| 9611 |
+
"loss": 2.5916,
|
| 9612 |
+
"num_input_tokens_seen": 14155771456,
|
| 9613 |
+
"step": 54000
|
| 9614 |
+
},
|
| 9615 |
+
{
|
| 9616 |
+
"epoch": 0.2575813587416674,
|
| 9617 |
+
"eval_loss": 2.4871394634246826,
|
| 9618 |
+
"eval_runtime": 53.8695,
|
| 9619 |
+
"eval_samples_per_second": 92.817,
|
| 9620 |
+
"eval_steps_per_second": 23.204,
|
| 9621 |
+
"num_input_tokens_seen": 14155771456,
|
| 9622 |
+
"step": 54000
|
| 9623 |
}
|
| 9624 |
],
|
| 9625 |
"logging_steps": 50,
|
| 9626 |
"max_steps": 70000,
|
| 9627 |
+
"num_input_tokens_seen": 14155771456,
|
| 9628 |
"num_train_epochs": 1,
|
| 9629 |
"save_steps": 1000,
|
| 9630 |
"stateful_callbacks": {
|
|
|
|
| 9639 |
"attributes": {}
|
| 9640 |
}
|
| 9641 |
},
|
| 9642 |
+
"total_flos": 3.7868070248094106e+18,
|
| 9643 |
"train_batch_size": 64,
|
| 9644 |
"trial_name": null,
|
| 9645 |
"trial_params": null
|