Training in progress, step 55000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b5142916764b6385c48d096b2a7f336531a047dd5a1c0cd7b8aa09a2fdd35007
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c871f297ec758cbe8e1e4a52c756dfd036112baba8fbed3f20c9699d23ba9b0
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9a5eacfa99e53a8a1de73851121ef39f03223e9cc67398ac06a0e84e6dbf4ae3
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aaffe7b6e7bde964bb6e6784b39ca6209cca3589a90aff9795b02fa93025464e
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -9620,11 +9620,189 @@
|
|
| 9620 |
"eval_steps_per_second": 23.204,
|
| 9621 |
"num_input_tokens_seen": 14155771456,
|
| 9622 |
"step": 54000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9623 |
}
|
| 9624 |
],
|
| 9625 |
"logging_steps": 50,
|
| 9626 |
"max_steps": 70000,
|
| 9627 |
-
"num_input_tokens_seen":
|
| 9628 |
"num_train_epochs": 1,
|
| 9629 |
"save_steps": 1000,
|
| 9630 |
"stateful_callbacks": {
|
|
@@ -9639,7 +9817,7 @@
|
|
| 9639 |
"attributes": {}
|
| 9640 |
}
|
| 9641 |
},
|
| 9642 |
-
"total_flos": 3.
|
| 9643 |
"train_batch_size": 64,
|
| 9644 |
"trial_name": null,
|
| 9645 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.2623513839035501,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 55000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 9620 |
"eval_steps_per_second": 23.204,
|
| 9621 |
"num_input_tokens_seen": 14155771456,
|
| 9622 |
"step": 54000
|
| 9623 |
+
},
|
| 9624 |
+
{
|
| 9625 |
+
"epoch": 0.2578198599997615,
|
| 9626 |
+
"grad_norm": 0.2028844654560089,
|
| 9627 |
+
"learning_rate": 0.001,
|
| 9628 |
+
"loss": 2.6039,
|
| 9629 |
+
"num_input_tokens_seen": 14168878656,
|
| 9630 |
+
"step": 54050
|
| 9631 |
+
},
|
| 9632 |
+
{
|
| 9633 |
+
"epoch": 0.25805836125785564,
|
| 9634 |
+
"grad_norm": 0.19936658442020416,
|
| 9635 |
+
"learning_rate": 0.001,
|
| 9636 |
+
"loss": 2.5985,
|
| 9637 |
+
"num_input_tokens_seen": 14181985856,
|
| 9638 |
+
"step": 54100
|
| 9639 |
+
},
|
| 9640 |
+
{
|
| 9641 |
+
"epoch": 0.2582968625159498,
|
| 9642 |
+
"grad_norm": 0.2087993025779724,
|
| 9643 |
+
"learning_rate": 0.001,
|
| 9644 |
+
"loss": 2.62,
|
| 9645 |
+
"num_input_tokens_seen": 14195093056,
|
| 9646 |
+
"step": 54150
|
| 9647 |
+
},
|
| 9648 |
+
{
|
| 9649 |
+
"epoch": 0.2585353637740439,
|
| 9650 |
+
"grad_norm": 0.18972960114479065,
|
| 9651 |
+
"learning_rate": 0.001,
|
| 9652 |
+
"loss": 2.5936,
|
| 9653 |
+
"num_input_tokens_seen": 14208200256,
|
| 9654 |
+
"step": 54200
|
| 9655 |
+
},
|
| 9656 |
+
{
|
| 9657 |
+
"epoch": 0.25877386503213806,
|
| 9658 |
+
"grad_norm": 0.2162945419549942,
|
| 9659 |
+
"learning_rate": 0.001,
|
| 9660 |
+
"loss": 2.6125,
|
| 9661 |
+
"num_input_tokens_seen": 14221307456,
|
| 9662 |
+
"step": 54250
|
| 9663 |
+
},
|
| 9664 |
+
{
|
| 9665 |
+
"epoch": 0.25901236629023217,
|
| 9666 |
+
"grad_norm": 0.2538411319255829,
|
| 9667 |
+
"learning_rate": 0.001,
|
| 9668 |
+
"loss": 2.6197,
|
| 9669 |
+
"num_input_tokens_seen": 14234414656,
|
| 9670 |
+
"step": 54300
|
| 9671 |
+
},
|
| 9672 |
+
{
|
| 9673 |
+
"epoch": 0.2592508675483263,
|
| 9674 |
+
"grad_norm": 0.28060850501060486,
|
| 9675 |
+
"learning_rate": 0.001,
|
| 9676 |
+
"loss": 2.6194,
|
| 9677 |
+
"num_input_tokens_seen": 14247521856,
|
| 9678 |
+
"step": 54350
|
| 9679 |
+
},
|
| 9680 |
+
{
|
| 9681 |
+
"epoch": 0.25948936880642043,
|
| 9682 |
+
"grad_norm": 0.21557608246803284,
|
| 9683 |
+
"learning_rate": 0.001,
|
| 9684 |
+
"loss": 2.623,
|
| 9685 |
+
"num_input_tokens_seen": 14260629056,
|
| 9686 |
+
"step": 54400
|
| 9687 |
+
},
|
| 9688 |
+
{
|
| 9689 |
+
"epoch": 0.2597278700645146,
|
| 9690 |
+
"grad_norm": 0.21628426015377045,
|
| 9691 |
+
"learning_rate": 0.001,
|
| 9692 |
+
"loss": 2.6077,
|
| 9693 |
+
"num_input_tokens_seen": 14273736256,
|
| 9694 |
+
"step": 54450
|
| 9695 |
+
},
|
| 9696 |
+
{
|
| 9697 |
+
"epoch": 0.25996637132260875,
|
| 9698 |
+
"grad_norm": 0.19123327732086182,
|
| 9699 |
+
"learning_rate": 0.001,
|
| 9700 |
+
"loss": 2.5991,
|
| 9701 |
+
"num_input_tokens_seen": 14286843456,
|
| 9702 |
+
"step": 54500
|
| 9703 |
+
},
|
| 9704 |
+
{
|
| 9705 |
+
"epoch": 0.25996637132260875,
|
| 9706 |
+
"eval_loss": 2.4861645698547363,
|
| 9707 |
+
"eval_runtime": 53.6448,
|
| 9708 |
+
"eval_samples_per_second": 93.206,
|
| 9709 |
+
"eval_steps_per_second": 23.301,
|
| 9710 |
+
"num_input_tokens_seen": 14286843456,
|
| 9711 |
+
"step": 54500
|
| 9712 |
+
},
|
| 9713 |
+
{
|
| 9714 |
+
"epoch": 0.26020487258070285,
|
| 9715 |
+
"grad_norm": 0.20462968945503235,
|
| 9716 |
+
"learning_rate": 0.001,
|
| 9717 |
+
"loss": 2.5887,
|
| 9718 |
+
"num_input_tokens_seen": 14299950656,
|
| 9719 |
+
"step": 54550
|
| 9720 |
+
},
|
| 9721 |
+
{
|
| 9722 |
+
"epoch": 0.260443373838797,
|
| 9723 |
+
"grad_norm": 0.20952938497066498,
|
| 9724 |
+
"learning_rate": 0.001,
|
| 9725 |
+
"loss": 2.608,
|
| 9726 |
+
"num_input_tokens_seen": 14313057856,
|
| 9727 |
+
"step": 54600
|
| 9728 |
+
},
|
| 9729 |
+
{
|
| 9730 |
+
"epoch": 0.2606818750968911,
|
| 9731 |
+
"grad_norm": 0.2095402032136917,
|
| 9732 |
+
"learning_rate": 0.001,
|
| 9733 |
+
"loss": 2.6079,
|
| 9734 |
+
"num_input_tokens_seen": 14326165056,
|
| 9735 |
+
"step": 54650
|
| 9736 |
+
},
|
| 9737 |
+
{
|
| 9738 |
+
"epoch": 0.2609203763549853,
|
| 9739 |
+
"grad_norm": 0.2343517541885376,
|
| 9740 |
+
"learning_rate": 0.001,
|
| 9741 |
+
"loss": 2.6124,
|
| 9742 |
+
"num_input_tokens_seen": 14339272256,
|
| 9743 |
+
"step": 54700
|
| 9744 |
+
},
|
| 9745 |
+
{
|
| 9746 |
+
"epoch": 0.26115887761307943,
|
| 9747 |
+
"grad_norm": 0.23840700089931488,
|
| 9748 |
+
"learning_rate": 0.001,
|
| 9749 |
+
"loss": 2.6015,
|
| 9750 |
+
"num_input_tokens_seen": 14352379456,
|
| 9751 |
+
"step": 54750
|
| 9752 |
+
},
|
| 9753 |
+
{
|
| 9754 |
+
"epoch": 0.26139737887117354,
|
| 9755 |
+
"grad_norm": 0.22024671733379364,
|
| 9756 |
+
"learning_rate": 0.001,
|
| 9757 |
+
"loss": 2.5812,
|
| 9758 |
+
"num_input_tokens_seen": 14365486656,
|
| 9759 |
+
"step": 54800
|
| 9760 |
+
},
|
| 9761 |
+
{
|
| 9762 |
+
"epoch": 0.2616358801292677,
|
| 9763 |
+
"grad_norm": 0.19884246587753296,
|
| 9764 |
+
"learning_rate": 0.001,
|
| 9765 |
+
"loss": 2.6118,
|
| 9766 |
+
"num_input_tokens_seen": 14378593856,
|
| 9767 |
+
"step": 54850
|
| 9768 |
+
},
|
| 9769 |
+
{
|
| 9770 |
+
"epoch": 0.2618743813873618,
|
| 9771 |
+
"grad_norm": 0.46560585498809814,
|
| 9772 |
+
"learning_rate": 0.001,
|
| 9773 |
+
"loss": 2.6024,
|
| 9774 |
+
"num_input_tokens_seen": 14391701056,
|
| 9775 |
+
"step": 54900
|
| 9776 |
+
},
|
| 9777 |
+
{
|
| 9778 |
+
"epoch": 0.26211288264545596,
|
| 9779 |
+
"grad_norm": 0.2956256568431854,
|
| 9780 |
+
"learning_rate": 0.001,
|
| 9781 |
+
"loss": 2.6073,
|
| 9782 |
+
"num_input_tokens_seen": 14404808256,
|
| 9783 |
+
"step": 54950
|
| 9784 |
+
},
|
| 9785 |
+
{
|
| 9786 |
+
"epoch": 0.2623513839035501,
|
| 9787 |
+
"grad_norm": 0.286327064037323,
|
| 9788 |
+
"learning_rate": 0.001,
|
| 9789 |
+
"loss": 2.5946,
|
| 9790 |
+
"num_input_tokens_seen": 14417915456,
|
| 9791 |
+
"step": 55000
|
| 9792 |
+
},
|
| 9793 |
+
{
|
| 9794 |
+
"epoch": 0.2623513839035501,
|
| 9795 |
+
"eval_loss": 2.4892399311065674,
|
| 9796 |
+
"eval_runtime": 53.3184,
|
| 9797 |
+
"eval_samples_per_second": 93.776,
|
| 9798 |
+
"eval_steps_per_second": 23.444,
|
| 9799 |
+
"num_input_tokens_seen": 14417915456,
|
| 9800 |
+
"step": 55000
|
| 9801 |
}
|
| 9802 |
],
|
| 9803 |
"logging_steps": 50,
|
| 9804 |
"max_steps": 70000,
|
| 9805 |
+
"num_input_tokens_seen": 14417915456,
|
| 9806 |
"num_train_epochs": 1,
|
| 9807 |
"save_steps": 1000,
|
| 9808 |
"stateful_callbacks": {
|
|
|
|
| 9817 |
"attributes": {}
|
| 9818 |
}
|
| 9819 |
},
|
| 9820 |
+
"total_flos": 3.8569331033348506e+18,
|
| 9821 |
"train_batch_size": 64,
|
| 9822 |
"trial_name": null,
|
| 9823 |
"trial_params": null
|