Training in progress, step 20000, checkpoint
Browse files- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/pytorch_model.bin +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +353 -3
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 304481530
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:db7432141f78eaf89762dbfa7cd270e9a33828df0e033550b34c9481463227a6
|
| 3 |
size 304481530
|
last-checkpoint/pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 402029570
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5277a64b453be07385d84e9f80db45a50e37f4890167c3e4c572e1a6fb7bdaaa
|
| 3 |
size 402029570
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:baf78593c218b20d298480993c7fbaf9b2ea100e2a22749e83c5c1aba18f3f4c
|
| 3 |
size 14960
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:65689efd51e6068aa6422e7737ef0148b7583a59986d4d53a6a0a02103bfcb11
|
| 3 |
size 14960
|
last-checkpoint/rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90981e208c884dfa861b8ec3fc9badb69e05a78f261183a623615ac5a97c3c95
|
| 3 |
size 14960
|
last-checkpoint/rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c76af8694f8d37feee42992c1a0000126f33879d8755e31713c98eb2fdb7b48c
|
| 3 |
size 14960
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5601fb269352a3de217d5b9fa42e25567fee4127194adcf0f48431818665f1d8
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -6658,6 +6658,356 @@
|
|
| 6658 |
"learning_rate": 0.0004939859465103925,
|
| 6659 |
"loss": 19.6594,
|
| 6660 |
"step": 19000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6661 |
}
|
| 6662 |
],
|
| 6663 |
"logging_steps": 20,
|
|
@@ -6677,7 +7027,7 @@
|
|
| 6677 |
"attributes": {}
|
| 6678 |
}
|
| 6679 |
},
|
| 6680 |
-
"total_flos": 1.
|
| 6681 |
"train_batch_size": 48,
|
| 6682 |
"trial_name": null,
|
| 6683 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.03899946863223989,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 20000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 6658 |
"learning_rate": 0.0004939859465103925,
|
| 6659 |
"loss": 19.6594,
|
| 6660 |
"step": 19000
|
| 6661 |
+
},
|
| 6662 |
+
{
|
| 6663 |
+
"epoch": 0.03708849466926013,
|
| 6664 |
+
"grad_norm": 11.0625,
|
| 6665 |
+
"learning_rate": 0.0004939794444794831,
|
| 6666 |
+
"loss": 19.7557,
|
| 6667 |
+
"step": 19020
|
| 6668 |
+
},
|
| 6669 |
+
{
|
| 6670 |
+
"epoch": 0.03712749413789237,
|
| 6671 |
+
"grad_norm": 8.5,
|
| 6672 |
+
"learning_rate": 0.0004939729424485738,
|
| 6673 |
+
"loss": 19.6508,
|
| 6674 |
+
"step": 19040
|
| 6675 |
+
},
|
| 6676 |
+
{
|
| 6677 |
+
"epoch": 0.03716649360652461,
|
| 6678 |
+
"grad_norm": 12.375,
|
| 6679 |
+
"learning_rate": 0.0004939664404176644,
|
| 6680 |
+
"loss": 19.6447,
|
| 6681 |
+
"step": 19060
|
| 6682 |
+
},
|
| 6683 |
+
{
|
| 6684 |
+
"epoch": 0.03720549307515685,
|
| 6685 |
+
"grad_norm": 9.5625,
|
| 6686 |
+
"learning_rate": 0.0004939599383867551,
|
| 6687 |
+
"loss": 19.5852,
|
| 6688 |
+
"step": 19080
|
| 6689 |
+
},
|
| 6690 |
+
{
|
| 6691 |
+
"epoch": 0.037244492543789094,
|
| 6692 |
+
"grad_norm": 9.5625,
|
| 6693 |
+
"learning_rate": 0.0004939534363558458,
|
| 6694 |
+
"loss": 19.6278,
|
| 6695 |
+
"step": 19100
|
| 6696 |
+
},
|
| 6697 |
+
{
|
| 6698 |
+
"epoch": 0.03728349201242133,
|
| 6699 |
+
"grad_norm": 11.25,
|
| 6700 |
+
"learning_rate": 0.0004939469343249364,
|
| 6701 |
+
"loss": 19.7012,
|
| 6702 |
+
"step": 19120
|
| 6703 |
+
},
|
| 6704 |
+
{
|
| 6705 |
+
"epoch": 0.03732249148105357,
|
| 6706 |
+
"grad_norm": 10.9375,
|
| 6707 |
+
"learning_rate": 0.0004939404322940271,
|
| 6708 |
+
"loss": 19.6584,
|
| 6709 |
+
"step": 19140
|
| 6710 |
+
},
|
| 6711 |
+
{
|
| 6712 |
+
"epoch": 0.03736149094968581,
|
| 6713 |
+
"grad_norm": 10.375,
|
| 6714 |
+
"learning_rate": 0.0004939339302631176,
|
| 6715 |
+
"loss": 19.6233,
|
| 6716 |
+
"step": 19160
|
| 6717 |
+
},
|
| 6718 |
+
{
|
| 6719 |
+
"epoch": 0.03740049041831805,
|
| 6720 |
+
"grad_norm": 11.625,
|
| 6721 |
+
"learning_rate": 0.0004939274282322083,
|
| 6722 |
+
"loss": 19.5714,
|
| 6723 |
+
"step": 19180
|
| 6724 |
+
},
|
| 6725 |
+
{
|
| 6726 |
+
"epoch": 0.03743948988695029,
|
| 6727 |
+
"grad_norm": 10.0,
|
| 6728 |
+
"learning_rate": 0.0004939209262012989,
|
| 6729 |
+
"loss": 19.5819,
|
| 6730 |
+
"step": 19200
|
| 6731 |
+
},
|
| 6732 |
+
{
|
| 6733 |
+
"epoch": 0.03747848935558253,
|
| 6734 |
+
"grad_norm": 10.375,
|
| 6735 |
+
"learning_rate": 0.0004939144241703896,
|
| 6736 |
+
"loss": 19.605,
|
| 6737 |
+
"step": 19220
|
| 6738 |
+
},
|
| 6739 |
+
{
|
| 6740 |
+
"epoch": 0.03751748882421477,
|
| 6741 |
+
"grad_norm": 10.6875,
|
| 6742 |
+
"learning_rate": 0.0004939079221394803,
|
| 6743 |
+
"loss": 19.6348,
|
| 6744 |
+
"step": 19240
|
| 6745 |
+
},
|
| 6746 |
+
{
|
| 6747 |
+
"epoch": 0.03755648829284701,
|
| 6748 |
+
"grad_norm": 9.3125,
|
| 6749 |
+
"learning_rate": 0.0004939014201085709,
|
| 6750 |
+
"loss": 19.6152,
|
| 6751 |
+
"step": 19260
|
| 6752 |
+
},
|
| 6753 |
+
{
|
| 6754 |
+
"epoch": 0.03759548776147925,
|
| 6755 |
+
"grad_norm": 10.1875,
|
| 6756 |
+
"learning_rate": 0.0004938949180776616,
|
| 6757 |
+
"loss": 19.6556,
|
| 6758 |
+
"step": 19280
|
| 6759 |
+
},
|
| 6760 |
+
{
|
| 6761 |
+
"epoch": 0.03763448723011149,
|
| 6762 |
+
"grad_norm": 9.125,
|
| 6763 |
+
"learning_rate": 0.0004938884160467522,
|
| 6764 |
+
"loss": 19.603,
|
| 6765 |
+
"step": 19300
|
| 6766 |
+
},
|
| 6767 |
+
{
|
| 6768 |
+
"epoch": 0.03767348669874373,
|
| 6769 |
+
"grad_norm": 10.0625,
|
| 6770 |
+
"learning_rate": 0.0004938819140158429,
|
| 6771 |
+
"loss": 19.6083,
|
| 6772 |
+
"step": 19320
|
| 6773 |
+
},
|
| 6774 |
+
{
|
| 6775 |
+
"epoch": 0.03771248616737597,
|
| 6776 |
+
"grad_norm": 11.8125,
|
| 6777 |
+
"learning_rate": 0.0004938754119849335,
|
| 6778 |
+
"loss": 19.6945,
|
| 6779 |
+
"step": 19340
|
| 6780 |
+
},
|
| 6781 |
+
{
|
| 6782 |
+
"epoch": 0.03775148563600821,
|
| 6783 |
+
"grad_norm": 10.125,
|
| 6784 |
+
"learning_rate": 0.0004938689099540242,
|
| 6785 |
+
"loss": 19.5438,
|
| 6786 |
+
"step": 19360
|
| 6787 |
+
},
|
| 6788 |
+
{
|
| 6789 |
+
"epoch": 0.03779048510464045,
|
| 6790 |
+
"grad_norm": 9.5625,
|
| 6791 |
+
"learning_rate": 0.0004938624079231149,
|
| 6792 |
+
"loss": 19.6158,
|
| 6793 |
+
"step": 19380
|
| 6794 |
+
},
|
| 6795 |
+
{
|
| 6796 |
+
"epoch": 0.03782948457327269,
|
| 6797 |
+
"grad_norm": 10.1875,
|
| 6798 |
+
"learning_rate": 0.0004938559058922054,
|
| 6799 |
+
"loss": 19.6023,
|
| 6800 |
+
"step": 19400
|
| 6801 |
+
},
|
| 6802 |
+
{
|
| 6803 |
+
"epoch": 0.03786848404190493,
|
| 6804 |
+
"grad_norm": 9.75,
|
| 6805 |
+
"learning_rate": 0.0004938494038612961,
|
| 6806 |
+
"loss": 19.6143,
|
| 6807 |
+
"step": 19420
|
| 6808 |
+
},
|
| 6809 |
+
{
|
| 6810 |
+
"epoch": 0.03790748351053717,
|
| 6811 |
+
"grad_norm": 9.875,
|
| 6812 |
+
"learning_rate": 0.0004938429018303867,
|
| 6813 |
+
"loss": 19.5367,
|
| 6814 |
+
"step": 19440
|
| 6815 |
+
},
|
| 6816 |
+
{
|
| 6817 |
+
"epoch": 0.03794648297916941,
|
| 6818 |
+
"grad_norm": 9.375,
|
| 6819 |
+
"learning_rate": 0.0004938363997994774,
|
| 6820 |
+
"loss": 19.5761,
|
| 6821 |
+
"step": 19460
|
| 6822 |
+
},
|
| 6823 |
+
{
|
| 6824 |
+
"epoch": 0.03798548244780165,
|
| 6825 |
+
"grad_norm": 10.1875,
|
| 6826 |
+
"learning_rate": 0.000493829897768568,
|
| 6827 |
+
"loss": 19.5939,
|
| 6828 |
+
"step": 19480
|
| 6829 |
+
},
|
| 6830 |
+
{
|
| 6831 |
+
"epoch": 0.038024481916433886,
|
| 6832 |
+
"grad_norm": 10.0,
|
| 6833 |
+
"learning_rate": 0.0004938233957376587,
|
| 6834 |
+
"loss": 19.5595,
|
| 6835 |
+
"step": 19500
|
| 6836 |
+
},
|
| 6837 |
+
{
|
| 6838 |
+
"epoch": 0.03806348138506613,
|
| 6839 |
+
"grad_norm": 12.75,
|
| 6840 |
+
"learning_rate": 0.0004938168937067493,
|
| 6841 |
+
"loss": 19.5722,
|
| 6842 |
+
"step": 19520
|
| 6843 |
+
},
|
| 6844 |
+
{
|
| 6845 |
+
"epoch": 0.03810248085369837,
|
| 6846 |
+
"grad_norm": 10.375,
|
| 6847 |
+
"learning_rate": 0.00049381039167584,
|
| 6848 |
+
"loss": 19.5889,
|
| 6849 |
+
"step": 19540
|
| 6850 |
+
},
|
| 6851 |
+
{
|
| 6852 |
+
"epoch": 0.038141480322330605,
|
| 6853 |
+
"grad_norm": 10.5,
|
| 6854 |
+
"learning_rate": 0.0004938038896449307,
|
| 6855 |
+
"loss": 19.5379,
|
| 6856 |
+
"step": 19560
|
| 6857 |
+
},
|
| 6858 |
+
{
|
| 6859 |
+
"epoch": 0.03818047979096285,
|
| 6860 |
+
"grad_norm": 9.875,
|
| 6861 |
+
"learning_rate": 0.0004937973876140213,
|
| 6862 |
+
"loss": 19.5243,
|
| 6863 |
+
"step": 19580
|
| 6864 |
+
},
|
| 6865 |
+
{
|
| 6866 |
+
"epoch": 0.03821947925959509,
|
| 6867 |
+
"grad_norm": 9.1875,
|
| 6868 |
+
"learning_rate": 0.0004937908855831119,
|
| 6869 |
+
"loss": 19.6067,
|
| 6870 |
+
"step": 19600
|
| 6871 |
+
},
|
| 6872 |
+
{
|
| 6873 |
+
"epoch": 0.03825847872822733,
|
| 6874 |
+
"grad_norm": 10.25,
|
| 6875 |
+
"learning_rate": 0.0004937843835522025,
|
| 6876 |
+
"loss": 19.6051,
|
| 6877 |
+
"step": 19620
|
| 6878 |
+
},
|
| 6879 |
+
{
|
| 6880 |
+
"epoch": 0.038297478196859566,
|
| 6881 |
+
"grad_norm": 10.5,
|
| 6882 |
+
"learning_rate": 0.0004937778815212932,
|
| 6883 |
+
"loss": 19.5555,
|
| 6884 |
+
"step": 19640
|
| 6885 |
+
},
|
| 6886 |
+
{
|
| 6887 |
+
"epoch": 0.03833647766549181,
|
| 6888 |
+
"grad_norm": 10.75,
|
| 6889 |
+
"learning_rate": 0.0004937713794903838,
|
| 6890 |
+
"loss": 19.601,
|
| 6891 |
+
"step": 19660
|
| 6892 |
+
},
|
| 6893 |
+
{
|
| 6894 |
+
"epoch": 0.03837547713412405,
|
| 6895 |
+
"grad_norm": 9.5,
|
| 6896 |
+
"learning_rate": 0.0004937648774594745,
|
| 6897 |
+
"loss": 19.5818,
|
| 6898 |
+
"step": 19680
|
| 6899 |
+
},
|
| 6900 |
+
{
|
| 6901 |
+
"epoch": 0.038414476602756285,
|
| 6902 |
+
"grad_norm": 10.625,
|
| 6903 |
+
"learning_rate": 0.0004937583754285651,
|
| 6904 |
+
"loss": 19.5565,
|
| 6905 |
+
"step": 19700
|
| 6906 |
+
},
|
| 6907 |
+
{
|
| 6908 |
+
"epoch": 0.03845347607138853,
|
| 6909 |
+
"grad_norm": 9.9375,
|
| 6910 |
+
"learning_rate": 0.0004937518733976558,
|
| 6911 |
+
"loss": 19.5719,
|
| 6912 |
+
"step": 19720
|
| 6913 |
+
},
|
| 6914 |
+
{
|
| 6915 |
+
"epoch": 0.03849247554002077,
|
| 6916 |
+
"grad_norm": 10.3125,
|
| 6917 |
+
"learning_rate": 0.0004937453713667465,
|
| 6918 |
+
"loss": 19.5583,
|
| 6919 |
+
"step": 19740
|
| 6920 |
+
},
|
| 6921 |
+
{
|
| 6922 |
+
"epoch": 0.038531475008653004,
|
| 6923 |
+
"grad_norm": 10.4375,
|
| 6924 |
+
"learning_rate": 0.000493738869335837,
|
| 6925 |
+
"loss": 19.5279,
|
| 6926 |
+
"step": 19760
|
| 6927 |
+
},
|
| 6928 |
+
{
|
| 6929 |
+
"epoch": 0.038570474477285246,
|
| 6930 |
+
"grad_norm": 9.4375,
|
| 6931 |
+
"learning_rate": 0.0004937323673049277,
|
| 6932 |
+
"loss": 19.4711,
|
| 6933 |
+
"step": 19780
|
| 6934 |
+
},
|
| 6935 |
+
{
|
| 6936 |
+
"epoch": 0.03860947394591749,
|
| 6937 |
+
"grad_norm": 11.9375,
|
| 6938 |
+
"learning_rate": 0.0004937258652740183,
|
| 6939 |
+
"loss": 19.5244,
|
| 6940 |
+
"step": 19800
|
| 6941 |
+
},
|
| 6942 |
+
{
|
| 6943 |
+
"epoch": 0.038648473414549724,
|
| 6944 |
+
"grad_norm": 9.8125,
|
| 6945 |
+
"learning_rate": 0.000493719363243109,
|
| 6946 |
+
"loss": 19.5652,
|
| 6947 |
+
"step": 19820
|
| 6948 |
+
},
|
| 6949 |
+
{
|
| 6950 |
+
"epoch": 0.038687472883181966,
|
| 6951 |
+
"grad_norm": 11.4375,
|
| 6952 |
+
"learning_rate": 0.0004937128612121996,
|
| 6953 |
+
"loss": 19.5002,
|
| 6954 |
+
"step": 19840
|
| 6955 |
+
},
|
| 6956 |
+
{
|
| 6957 |
+
"epoch": 0.03872647235181421,
|
| 6958 |
+
"grad_norm": 10.1875,
|
| 6959 |
+
"learning_rate": 0.0004937063591812903,
|
| 6960 |
+
"loss": 19.4699,
|
| 6961 |
+
"step": 19860
|
| 6962 |
+
},
|
| 6963 |
+
{
|
| 6964 |
+
"epoch": 0.03876547182044645,
|
| 6965 |
+
"grad_norm": 10.8125,
|
| 6966 |
+
"learning_rate": 0.000493699857150381,
|
| 6967 |
+
"loss": 19.486,
|
| 6968 |
+
"step": 19880
|
| 6969 |
+
},
|
| 6970 |
+
{
|
| 6971 |
+
"epoch": 0.038804471289078685,
|
| 6972 |
+
"grad_norm": 9.0625,
|
| 6973 |
+
"learning_rate": 0.0004936933551194716,
|
| 6974 |
+
"loss": 19.5072,
|
| 6975 |
+
"step": 19900
|
| 6976 |
+
},
|
| 6977 |
+
{
|
| 6978 |
+
"epoch": 0.03884347075771093,
|
| 6979 |
+
"grad_norm": 9.625,
|
| 6980 |
+
"learning_rate": 0.0004936868530885622,
|
| 6981 |
+
"loss": 19.434,
|
| 6982 |
+
"step": 19920
|
| 6983 |
+
},
|
| 6984 |
+
{
|
| 6985 |
+
"epoch": 0.03888247022634317,
|
| 6986 |
+
"grad_norm": 9.5,
|
| 6987 |
+
"learning_rate": 0.0004936803510576528,
|
| 6988 |
+
"loss": 19.4787,
|
| 6989 |
+
"step": 19940
|
| 6990 |
+
},
|
| 6991 |
+
{
|
| 6992 |
+
"epoch": 0.038921469694975404,
|
| 6993 |
+
"grad_norm": 10.0625,
|
| 6994 |
+
"learning_rate": 0.0004936738490267435,
|
| 6995 |
+
"loss": 19.4656,
|
| 6996 |
+
"step": 19960
|
| 6997 |
+
},
|
| 6998 |
+
{
|
| 6999 |
+
"epoch": 0.038960469163607646,
|
| 7000 |
+
"grad_norm": 9.125,
|
| 7001 |
+
"learning_rate": 0.0004936673469958341,
|
| 7002 |
+
"loss": 19.5858,
|
| 7003 |
+
"step": 19980
|
| 7004 |
+
},
|
| 7005 |
+
{
|
| 7006 |
+
"epoch": 0.03899946863223989,
|
| 7007 |
+
"grad_norm": 8.875,
|
| 7008 |
+
"learning_rate": 0.0004936608449649248,
|
| 7009 |
+
"loss": 19.5272,
|
| 7010 |
+
"step": 20000
|
| 7011 |
}
|
| 7012 |
],
|
| 7013 |
"logging_steps": 20,
|
|
|
|
| 7027 |
"attributes": {}
|
| 7028 |
}
|
| 7029 |
},
|
| 7030 |
+
"total_flos": 1.4703345011961889e+19,
|
| 7031 |
"train_batch_size": 48,
|
| 7032 |
"trial_name": null,
|
| 7033 |
"trial_params": null
|