Training in progress, step 54000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7005ee4ac699efbe46e787cdaab363f958cca84ce68e125ca53c53198e13eeac
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e5151b63ca0c165877166c8eeb6faa3b784251ae57745f30c89f3dbaf08defd7
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e516d1931a63763a7fdfb84f01f54aaada25beb218520b62969ba08ff897cee4
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4b89459823d581d70469027e8df5427d5b9a07aadbd42c55eac43368b994e74e
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -9442,11 +9442,189 @@
|
|
| 9442 |
"eval_steps_per_second": 23.488,
|
| 9443 |
"num_input_tokens_seen": 13893632000,
|
| 9444 |
"step": 53000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9445 |
}
|
| 9446 |
],
|
| 9447 |
"logging_steps": 50,
|
| 9448 |
"max_steps": 60000,
|
| 9449 |
-
"num_input_tokens_seen":
|
| 9450 |
"num_train_epochs": 1,
|
| 9451 |
"save_steps": 1000,
|
| 9452 |
"stateful_callbacks": {
|
|
@@ -9461,7 +9639,7 @@
|
|
| 9461 |
"attributes": {}
|
| 9462 |
}
|
| 9463 |
},
|
| 9464 |
-
"total_flos": 3.
|
| 9465 |
"train_batch_size": 64,
|
| 9466 |
"trial_name": null,
|
| 9467 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.36323093920422156,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 54000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 9442 |
"eval_steps_per_second": 23.488,
|
| 9443 |
"num_input_tokens_seen": 13893632000,
|
| 9444 |
"step": 53000
|
| 9445 |
+
},
|
| 9446 |
+
{
|
| 9447 |
+
"epoch": 0.3568407652737769,
|
| 9448 |
+
"grad_norm": 0.3147699236869812,
|
| 9449 |
+
"learning_rate": 0.001,
|
| 9450 |
+
"loss": 3.0557,
|
| 9451 |
+
"num_input_tokens_seen": 13906739200,
|
| 9452 |
+
"step": 53050
|
| 9453 |
+
},
|
| 9454 |
+
{
|
| 9455 |
+
"epoch": 0.3571770902174845,
|
| 9456 |
+
"grad_norm": 0.22110533714294434,
|
| 9457 |
+
"learning_rate": 0.001,
|
| 9458 |
+
"loss": 3.0515,
|
| 9459 |
+
"num_input_tokens_seen": 13919846400,
|
| 9460 |
+
"step": 53100
|
| 9461 |
+
},
|
| 9462 |
+
{
|
| 9463 |
+
"epoch": 0.3575134151611921,
|
| 9464 |
+
"grad_norm": 0.23334212601184845,
|
| 9465 |
+
"learning_rate": 0.001,
|
| 9466 |
+
"loss": 3.0523,
|
| 9467 |
+
"num_input_tokens_seen": 13932953600,
|
| 9468 |
+
"step": 53150
|
| 9469 |
+
},
|
| 9470 |
+
{
|
| 9471 |
+
"epoch": 0.3578497401048997,
|
| 9472 |
+
"grad_norm": 0.200640469789505,
|
| 9473 |
+
"learning_rate": 0.001,
|
| 9474 |
+
"loss": 3.0621,
|
| 9475 |
+
"num_input_tokens_seen": 13946060800,
|
| 9476 |
+
"step": 53200
|
| 9477 |
+
},
|
| 9478 |
+
{
|
| 9479 |
+
"epoch": 0.35818606504860734,
|
| 9480 |
+
"grad_norm": 0.20875929296016693,
|
| 9481 |
+
"learning_rate": 0.001,
|
| 9482 |
+
"loss": 3.0591,
|
| 9483 |
+
"num_input_tokens_seen": 13959168000,
|
| 9484 |
+
"step": 53250
|
| 9485 |
+
},
|
| 9486 |
+
{
|
| 9487 |
+
"epoch": 0.35852238999231495,
|
| 9488 |
+
"grad_norm": 0.19065573811531067,
|
| 9489 |
+
"learning_rate": 0.001,
|
| 9490 |
+
"loss": 3.0591,
|
| 9491 |
+
"num_input_tokens_seen": 13972275200,
|
| 9492 |
+
"step": 53300
|
| 9493 |
+
},
|
| 9494 |
+
{
|
| 9495 |
+
"epoch": 0.35885871493602256,
|
| 9496 |
+
"grad_norm": 0.18688392639160156,
|
| 9497 |
+
"learning_rate": 0.001,
|
| 9498 |
+
"loss": 3.0475,
|
| 9499 |
+
"num_input_tokens_seen": 13985382400,
|
| 9500 |
+
"step": 53350
|
| 9501 |
+
},
|
| 9502 |
+
{
|
| 9503 |
+
"epoch": 0.3591950398797302,
|
| 9504 |
+
"grad_norm": 0.1864282786846161,
|
| 9505 |
+
"learning_rate": 0.001,
|
| 9506 |
+
"loss": 3.0485,
|
| 9507 |
+
"num_input_tokens_seen": 13998489600,
|
| 9508 |
+
"step": 53400
|
| 9509 |
+
},
|
| 9510 |
+
{
|
| 9511 |
+
"epoch": 0.35953136482343784,
|
| 9512 |
+
"grad_norm": 0.20456114411354065,
|
| 9513 |
+
"learning_rate": 0.001,
|
| 9514 |
+
"loss": 3.0529,
|
| 9515 |
+
"num_input_tokens_seen": 14011596800,
|
| 9516 |
+
"step": 53450
|
| 9517 |
+
},
|
| 9518 |
+
{
|
| 9519 |
+
"epoch": 0.35986768976714545,
|
| 9520 |
+
"grad_norm": 0.24362069368362427,
|
| 9521 |
+
"learning_rate": 0.001,
|
| 9522 |
+
"loss": 3.0444,
|
| 9523 |
+
"num_input_tokens_seen": 14024704000,
|
| 9524 |
+
"step": 53500
|
| 9525 |
+
},
|
| 9526 |
+
{
|
| 9527 |
+
"epoch": 0.35986768976714545,
|
| 9528 |
+
"eval_loss": 2.943416118621826,
|
| 9529 |
+
"eval_runtime": 53.1574,
|
| 9530 |
+
"eval_samples_per_second": 94.06,
|
| 9531 |
+
"eval_steps_per_second": 23.515,
|
| 9532 |
+
"num_input_tokens_seen": 14024704000,
|
| 9533 |
+
"step": 53500
|
| 9534 |
+
},
|
| 9535 |
+
{
|
| 9536 |
+
"epoch": 0.36020401471085306,
|
| 9537 |
+
"grad_norm": 0.19701169431209564,
|
| 9538 |
+
"learning_rate": 0.001,
|
| 9539 |
+
"loss": 3.0513,
|
| 9540 |
+
"num_input_tokens_seen": 14037811200,
|
| 9541 |
+
"step": 53550
|
| 9542 |
+
},
|
| 9543 |
+
{
|
| 9544 |
+
"epoch": 0.36054033965456067,
|
| 9545 |
+
"grad_norm": 0.1785692274570465,
|
| 9546 |
+
"learning_rate": 0.001,
|
| 9547 |
+
"loss": 3.0541,
|
| 9548 |
+
"num_input_tokens_seen": 14050918400,
|
| 9549 |
+
"step": 53600
|
| 9550 |
+
},
|
| 9551 |
+
{
|
| 9552 |
+
"epoch": 0.3608766645982683,
|
| 9553 |
+
"grad_norm": 0.1865462064743042,
|
| 9554 |
+
"learning_rate": 0.001,
|
| 9555 |
+
"loss": 3.0367,
|
| 9556 |
+
"num_input_tokens_seen": 14064025600,
|
| 9557 |
+
"step": 53650
|
| 9558 |
+
},
|
| 9559 |
+
{
|
| 9560 |
+
"epoch": 0.3612129895419759,
|
| 9561 |
+
"grad_norm": 0.4129047095775604,
|
| 9562 |
+
"learning_rate": 0.001,
|
| 9563 |
+
"loss": 3.043,
|
| 9564 |
+
"num_input_tokens_seen": 14077132800,
|
| 9565 |
+
"step": 53700
|
| 9566 |
+
},
|
| 9567 |
+
{
|
| 9568 |
+
"epoch": 0.3615493144856835,
|
| 9569 |
+
"grad_norm": 0.21066440641880035,
|
| 9570 |
+
"learning_rate": 0.001,
|
| 9571 |
+
"loss": 3.0585,
|
| 9572 |
+
"num_input_tokens_seen": 14090240000,
|
| 9573 |
+
"step": 53750
|
| 9574 |
+
},
|
| 9575 |
+
{
|
| 9576 |
+
"epoch": 0.3618856394293911,
|
| 9577 |
+
"grad_norm": 0.6820788383483887,
|
| 9578 |
+
"learning_rate": 0.001,
|
| 9579 |
+
"loss": 3.0534,
|
| 9580 |
+
"num_input_tokens_seen": 14103347200,
|
| 9581 |
+
"step": 53800
|
| 9582 |
+
},
|
| 9583 |
+
{
|
| 9584 |
+
"epoch": 0.3622219643730987,
|
| 9585 |
+
"grad_norm": 0.9664424657821655,
|
| 9586 |
+
"learning_rate": 0.001,
|
| 9587 |
+
"loss": 3.069,
|
| 9588 |
+
"num_input_tokens_seen": 14116454400,
|
| 9589 |
+
"step": 53850
|
| 9590 |
+
},
|
| 9591 |
+
{
|
| 9592 |
+
"epoch": 0.36255828931680634,
|
| 9593 |
+
"grad_norm": 0.35416921973228455,
|
| 9594 |
+
"learning_rate": 0.001,
|
| 9595 |
+
"loss": 3.0629,
|
| 9596 |
+
"num_input_tokens_seen": 14129561600,
|
| 9597 |
+
"step": 53900
|
| 9598 |
+
},
|
| 9599 |
+
{
|
| 9600 |
+
"epoch": 0.36289461426051395,
|
| 9601 |
+
"grad_norm": 0.3159606158733368,
|
| 9602 |
+
"learning_rate": 0.001,
|
| 9603 |
+
"loss": 3.0722,
|
| 9604 |
+
"num_input_tokens_seen": 14142668800,
|
| 9605 |
+
"step": 53950
|
| 9606 |
+
},
|
| 9607 |
+
{
|
| 9608 |
+
"epoch": 0.36323093920422156,
|
| 9609 |
+
"grad_norm": 0.2518790662288666,
|
| 9610 |
+
"learning_rate": 0.001,
|
| 9611 |
+
"loss": 3.071,
|
| 9612 |
+
"num_input_tokens_seen": 14155776000,
|
| 9613 |
+
"step": 54000
|
| 9614 |
+
},
|
| 9615 |
+
{
|
| 9616 |
+
"epoch": 0.36323093920422156,
|
| 9617 |
+
"eval_loss": 2.9483964443206787,
|
| 9618 |
+
"eval_runtime": 53.2042,
|
| 9619 |
+
"eval_samples_per_second": 93.978,
|
| 9620 |
+
"eval_steps_per_second": 23.494,
|
| 9621 |
+
"num_input_tokens_seen": 14155776000,
|
| 9622 |
+
"step": 54000
|
| 9623 |
}
|
| 9624 |
],
|
| 9625 |
"logging_steps": 50,
|
| 9626 |
"max_steps": 60000,
|
| 9627 |
+
"num_input_tokens_seen": 14155776000,
|
| 9628 |
"num_train_epochs": 1,
|
| 9629 |
"save_steps": 1000,
|
| 9630 |
"stateful_callbacks": {
|
|
|
|
| 9639 |
"attributes": {}
|
| 9640 |
}
|
| 9641 |
},
|
| 9642 |
+
"total_flos": 3.78680824037376e+18,
|
| 9643 |
"train_batch_size": 64,
|
| 9644 |
"trial_name": null,
|
| 9645 |
"trial_params": null
|