Training in progress, step 57000, checkpoint
Browse files- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/pytorch_model.bin +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +353 -3
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 304481530
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:972aa91ec388a1f2f04b57475bbe0ef1d7a488751339adb89aa78c0871d0f22b
|
| 3 |
size 304481530
|
last-checkpoint/pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 402029570
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1f588ba0d0b39a0c0daf2cb6afacca8a7aef1f4bc72fe4409ce0b2281d2e356a
|
| 3 |
size 402029570
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:93c5029373839975c8e2ce486239c3c93c8bcc84856a9726f25e6b39e80d4bdb
|
| 3 |
size 14960
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ba86940b99fa7512a6bd263e7bdaf7ba94fc8e695324bdfda4c03882f64aa78d
|
| 3 |
size 14960
|
last-checkpoint/rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cf8627c515e0a9fd4095a16f3cf6f960eebbddd06bd5667ffafe332a0150e802
|
| 3 |
size 14960
|
last-checkpoint/rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:55937ecae83bb1b9ebb2721682f64ea1aca1aefba9e61d245b7d516977f878f9
|
| 3 |
size 14960
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:039c09879ba9a48ef7918776fd751a67234de8e6a37518ae707982e7427ed8c9
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -19608,6 +19608,356 @@
|
|
| 19608 |
"learning_rate": 0.0004862947023007535,
|
| 19609 |
"loss": 17.0628,
|
| 19610 |
"step": 56000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19611 |
}
|
| 19612 |
],
|
| 19613 |
"logging_steps": 20,
|
|
@@ -19627,7 +19977,7 @@
|
|
| 19627 |
"attributes": {}
|
| 19628 |
}
|
| 19629 |
},
|
| 19630 |
-
"total_flos": 4.
|
| 19631 |
"train_batch_size": 48,
|
| 19632 |
"trial_name": null,
|
| 19633 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.08443493769590386,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 57000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 19608 |
"learning_rate": 0.0004862947023007535,
|
| 19609 |
"loss": 17.0628,
|
| 19610 |
"step": 56000
|
| 19611 |
+
},
|
| 19612 |
+
{
|
| 19613 |
+
"epoch": 0.08298324929341289,
|
| 19614 |
+
"grad_norm": 6.6875,
|
| 19615 |
+
"learning_rate": 0.00048628976336574593,
|
| 19616 |
+
"loss": 17.0877,
|
| 19617 |
+
"step": 56020
|
| 19618 |
+
},
|
| 19619 |
+
{
|
| 19620 |
+
"epoch": 0.08301287558734127,
|
| 19621 |
+
"grad_norm": 6.78125,
|
| 19622 |
+
"learning_rate": 0.0004862848244307383,
|
| 19623 |
+
"loss": 17.0776,
|
| 19624 |
+
"step": 56040
|
| 19625 |
+
},
|
| 19626 |
+
{
|
| 19627 |
+
"epoch": 0.08304250188126966,
|
| 19628 |
+
"grad_norm": 8.25,
|
| 19629 |
+
"learning_rate": 0.0004862798854957308,
|
| 19630 |
+
"loss": 17.0322,
|
| 19631 |
+
"step": 56060
|
| 19632 |
+
},
|
| 19633 |
+
{
|
| 19634 |
+
"epoch": 0.08307212817519805,
|
| 19635 |
+
"grad_norm": 6.5,
|
| 19636 |
+
"learning_rate": 0.0004862749465607232,
|
| 19637 |
+
"loss": 17.0593,
|
| 19638 |
+
"step": 56080
|
| 19639 |
+
},
|
| 19640 |
+
{
|
| 19641 |
+
"epoch": 0.08310175446912645,
|
| 19642 |
+
"grad_norm": 7.0,
|
| 19643 |
+
"learning_rate": 0.00048627000762571567,
|
| 19644 |
+
"loss": 17.0977,
|
| 19645 |
+
"step": 56100
|
| 19646 |
+
},
|
| 19647 |
+
{
|
| 19648 |
+
"epoch": 0.08313138076305483,
|
| 19649 |
+
"grad_norm": 6.21875,
|
| 19650 |
+
"learning_rate": 0.00048626506869070806,
|
| 19651 |
+
"loss": 17.0193,
|
| 19652 |
+
"step": 56120
|
| 19653 |
+
},
|
| 19654 |
+
{
|
| 19655 |
+
"epoch": 0.08316100705698322,
|
| 19656 |
+
"grad_norm": 7.09375,
|
| 19657 |
+
"learning_rate": 0.00048626012975570057,
|
| 19658 |
+
"loss": 17.0925,
|
| 19659 |
+
"step": 56140
|
| 19660 |
+
},
|
| 19661 |
+
{
|
| 19662 |
+
"epoch": 0.0831906333509116,
|
| 19663 |
+
"grad_norm": 7.5,
|
| 19664 |
+
"learning_rate": 0.00048625519082069296,
|
| 19665 |
+
"loss": 17.04,
|
| 19666 |
+
"step": 56160
|
| 19667 |
+
},
|
| 19668 |
+
{
|
| 19669 |
+
"epoch": 0.08322025964483999,
|
| 19670 |
+
"grad_norm": 7.40625,
|
| 19671 |
+
"learning_rate": 0.0004862502518856854,
|
| 19672 |
+
"loss": 17.1382,
|
| 19673 |
+
"step": 56180
|
| 19674 |
+
},
|
| 19675 |
+
{
|
| 19676 |
+
"epoch": 0.08324988593876838,
|
| 19677 |
+
"grad_norm": 7.34375,
|
| 19678 |
+
"learning_rate": 0.0004862453129506778,
|
| 19679 |
+
"loss": 17.0622,
|
| 19680 |
+
"step": 56200
|
| 19681 |
+
},
|
| 19682 |
+
{
|
| 19683 |
+
"epoch": 0.08327951223269676,
|
| 19684 |
+
"grad_norm": 8.125,
|
| 19685 |
+
"learning_rate": 0.0004862403740156703,
|
| 19686 |
+
"loss": 17.0424,
|
| 19687 |
+
"step": 56220
|
| 19688 |
+
},
|
| 19689 |
+
{
|
| 19690 |
+
"epoch": 0.08330913852662515,
|
| 19691 |
+
"grad_norm": 6.96875,
|
| 19692 |
+
"learning_rate": 0.0004862354350806627,
|
| 19693 |
+
"loss": 17.0722,
|
| 19694 |
+
"step": 56240
|
| 19695 |
+
},
|
| 19696 |
+
{
|
| 19697 |
+
"epoch": 0.08333876482055354,
|
| 19698 |
+
"grad_norm": 6.9375,
|
| 19699 |
+
"learning_rate": 0.00048623049614565514,
|
| 19700 |
+
"loss": 17.036,
|
| 19701 |
+
"step": 56260
|
| 19702 |
+
},
|
| 19703 |
+
{
|
| 19704 |
+
"epoch": 0.08336839111448192,
|
| 19705 |
+
"grad_norm": 6.78125,
|
| 19706 |
+
"learning_rate": 0.00048622555721064754,
|
| 19707 |
+
"loss": 17.0758,
|
| 19708 |
+
"step": 56280
|
| 19709 |
+
},
|
| 19710 |
+
{
|
| 19711 |
+
"epoch": 0.08339801740841031,
|
| 19712 |
+
"grad_norm": 6.6875,
|
| 19713 |
+
"learning_rate": 0.00048622061827564004,
|
| 19714 |
+
"loss": 17.1354,
|
| 19715 |
+
"step": 56300
|
| 19716 |
+
},
|
| 19717 |
+
{
|
| 19718 |
+
"epoch": 0.0834276437023387,
|
| 19719 |
+
"grad_norm": 6.9375,
|
| 19720 |
+
"learning_rate": 0.00048621567934063243,
|
| 19721 |
+
"loss": 17.1577,
|
| 19722 |
+
"step": 56320
|
| 19723 |
+
},
|
| 19724 |
+
{
|
| 19725 |
+
"epoch": 0.08345726999626708,
|
| 19726 |
+
"grad_norm": 6.46875,
|
| 19727 |
+
"learning_rate": 0.0004862107404056248,
|
| 19728 |
+
"loss": 17.0566,
|
| 19729 |
+
"step": 56340
|
| 19730 |
+
},
|
| 19731 |
+
{
|
| 19732 |
+
"epoch": 0.08348689629019547,
|
| 19733 |
+
"grad_norm": 7.40625,
|
| 19734 |
+
"learning_rate": 0.0004862058014706173,
|
| 19735 |
+
"loss": 17.0633,
|
| 19736 |
+
"step": 56360
|
| 19737 |
+
},
|
| 19738 |
+
{
|
| 19739 |
+
"epoch": 0.08351652258412386,
|
| 19740 |
+
"grad_norm": 7.0,
|
| 19741 |
+
"learning_rate": 0.0004862008625356097,
|
| 19742 |
+
"loss": 17.0427,
|
| 19743 |
+
"step": 56380
|
| 19744 |
+
},
|
| 19745 |
+
{
|
| 19746 |
+
"epoch": 0.08354614887805224,
|
| 19747 |
+
"grad_norm": 6.875,
|
| 19748 |
+
"learning_rate": 0.00048619592360060217,
|
| 19749 |
+
"loss": 17.068,
|
| 19750 |
+
"step": 56400
|
| 19751 |
+
},
|
| 19752 |
+
{
|
| 19753 |
+
"epoch": 0.08357577517198064,
|
| 19754 |
+
"grad_norm": 7.71875,
|
| 19755 |
+
"learning_rate": 0.00048619098466559456,
|
| 19756 |
+
"loss": 17.0289,
|
| 19757 |
+
"step": 56420
|
| 19758 |
+
},
|
| 19759 |
+
{
|
| 19760 |
+
"epoch": 0.08360540146590903,
|
| 19761 |
+
"grad_norm": 7.21875,
|
| 19762 |
+
"learning_rate": 0.00048618604573058707,
|
| 19763 |
+
"loss": 17.0595,
|
| 19764 |
+
"step": 56440
|
| 19765 |
+
},
|
| 19766 |
+
{
|
| 19767 |
+
"epoch": 0.08363502775983742,
|
| 19768 |
+
"grad_norm": 6.84375,
|
| 19769 |
+
"learning_rate": 0.00048618110679557946,
|
| 19770 |
+
"loss": 17.0464,
|
| 19771 |
+
"step": 56460
|
| 19772 |
+
},
|
| 19773 |
+
{
|
| 19774 |
+
"epoch": 0.0836646540537658,
|
| 19775 |
+
"grad_norm": 6.875,
|
| 19776 |
+
"learning_rate": 0.0004861761678605719,
|
| 19777 |
+
"loss": 17.0377,
|
| 19778 |
+
"step": 56480
|
| 19779 |
+
},
|
| 19780 |
+
{
|
| 19781 |
+
"epoch": 0.08369428034769419,
|
| 19782 |
+
"grad_norm": 6.90625,
|
| 19783 |
+
"learning_rate": 0.0004861712289255643,
|
| 19784 |
+
"loss": 17.1083,
|
| 19785 |
+
"step": 56500
|
| 19786 |
+
},
|
| 19787 |
+
{
|
| 19788 |
+
"epoch": 0.08372390664162258,
|
| 19789 |
+
"grad_norm": 6.4375,
|
| 19790 |
+
"learning_rate": 0.0004861662899905568,
|
| 19791 |
+
"loss": 17.0156,
|
| 19792 |
+
"step": 56520
|
| 19793 |
+
},
|
| 19794 |
+
{
|
| 19795 |
+
"epoch": 0.08375353293555096,
|
| 19796 |
+
"grad_norm": 7.21875,
|
| 19797 |
+
"learning_rate": 0.0004861613510555492,
|
| 19798 |
+
"loss": 17.1009,
|
| 19799 |
+
"step": 56540
|
| 19800 |
+
},
|
| 19801 |
+
{
|
| 19802 |
+
"epoch": 0.08378315922947935,
|
| 19803 |
+
"grad_norm": 7.09375,
|
| 19804 |
+
"learning_rate": 0.00048615641212054164,
|
| 19805 |
+
"loss": 17.0541,
|
| 19806 |
+
"step": 56560
|
| 19807 |
+
},
|
| 19808 |
+
{
|
| 19809 |
+
"epoch": 0.08381278552340773,
|
| 19810 |
+
"grad_norm": 6.875,
|
| 19811 |
+
"learning_rate": 0.00048615147318553404,
|
| 19812 |
+
"loss": 17.0052,
|
| 19813 |
+
"step": 56580
|
| 19814 |
+
},
|
| 19815 |
+
{
|
| 19816 |
+
"epoch": 0.08384241181733612,
|
| 19817 |
+
"grad_norm": 6.78125,
|
| 19818 |
+
"learning_rate": 0.00048614653425052654,
|
| 19819 |
+
"loss": 17.0689,
|
| 19820 |
+
"step": 56600
|
| 19821 |
+
},
|
| 19822 |
+
{
|
| 19823 |
+
"epoch": 0.08387203811126451,
|
| 19824 |
+
"grad_norm": 6.65625,
|
| 19825 |
+
"learning_rate": 0.00048614159531551893,
|
| 19826 |
+
"loss": 17.0825,
|
| 19827 |
+
"step": 56620
|
| 19828 |
+
},
|
| 19829 |
+
{
|
| 19830 |
+
"epoch": 0.0839016644051929,
|
| 19831 |
+
"grad_norm": 6.90625,
|
| 19832 |
+
"learning_rate": 0.0004861366563805114,
|
| 19833 |
+
"loss": 17.0745,
|
| 19834 |
+
"step": 56640
|
| 19835 |
+
},
|
| 19836 |
+
{
|
| 19837 |
+
"epoch": 0.08393129069912128,
|
| 19838 |
+
"grad_norm": 6.375,
|
| 19839 |
+
"learning_rate": 0.0004861317174455038,
|
| 19840 |
+
"loss": 17.0943,
|
| 19841 |
+
"step": 56660
|
| 19842 |
+
},
|
| 19843 |
+
{
|
| 19844 |
+
"epoch": 0.08396091699304967,
|
| 19845 |
+
"grad_norm": 7.28125,
|
| 19846 |
+
"learning_rate": 0.0004861267785104962,
|
| 19847 |
+
"loss": 16.9957,
|
| 19848 |
+
"step": 56680
|
| 19849 |
+
},
|
| 19850 |
+
{
|
| 19851 |
+
"epoch": 0.08399054328697805,
|
| 19852 |
+
"grad_norm": 7.15625,
|
| 19853 |
+
"learning_rate": 0.00048612183957548867,
|
| 19854 |
+
"loss": 17.0411,
|
| 19855 |
+
"step": 56700
|
| 19856 |
+
},
|
| 19857 |
+
{
|
| 19858 |
+
"epoch": 0.08402016958090644,
|
| 19859 |
+
"grad_norm": 7.25,
|
| 19860 |
+
"learning_rate": 0.00048611690064048106,
|
| 19861 |
+
"loss": 17.0705,
|
| 19862 |
+
"step": 56720
|
| 19863 |
+
},
|
| 19864 |
+
{
|
| 19865 |
+
"epoch": 0.08404979587483484,
|
| 19866 |
+
"grad_norm": 7.15625,
|
| 19867 |
+
"learning_rate": 0.00048611196170547357,
|
| 19868 |
+
"loss": 17.0302,
|
| 19869 |
+
"step": 56740
|
| 19870 |
+
},
|
| 19871 |
+
{
|
| 19872 |
+
"epoch": 0.08407942216876323,
|
| 19873 |
+
"grad_norm": 7.25,
|
| 19874 |
+
"learning_rate": 0.00048610702277046596,
|
| 19875 |
+
"loss": 17.0697,
|
| 19876 |
+
"step": 56760
|
| 19877 |
+
},
|
| 19878 |
+
{
|
| 19879 |
+
"epoch": 0.08410904846269161,
|
| 19880 |
+
"grad_norm": 7.0625,
|
| 19881 |
+
"learning_rate": 0.0004861020838354584,
|
| 19882 |
+
"loss": 16.9995,
|
| 19883 |
+
"step": 56780
|
| 19884 |
+
},
|
| 19885 |
+
{
|
| 19886 |
+
"epoch": 0.08413867475662,
|
| 19887 |
+
"grad_norm": 6.625,
|
| 19888 |
+
"learning_rate": 0.0004860971449004508,
|
| 19889 |
+
"loss": 17.0243,
|
| 19890 |
+
"step": 56800
|
| 19891 |
+
},
|
| 19892 |
+
{
|
| 19893 |
+
"epoch": 0.08416830105054839,
|
| 19894 |
+
"grad_norm": 7.1875,
|
| 19895 |
+
"learning_rate": 0.0004860922059654433,
|
| 19896 |
+
"loss": 17.0269,
|
| 19897 |
+
"step": 56820
|
| 19898 |
+
},
|
| 19899 |
+
{
|
| 19900 |
+
"epoch": 0.08419792734447677,
|
| 19901 |
+
"grad_norm": 7.1875,
|
| 19902 |
+
"learning_rate": 0.0004860872670304357,
|
| 19903 |
+
"loss": 17.095,
|
| 19904 |
+
"step": 56840
|
| 19905 |
+
},
|
| 19906 |
+
{
|
| 19907 |
+
"epoch": 0.08422755363840516,
|
| 19908 |
+
"grad_norm": 6.375,
|
| 19909 |
+
"learning_rate": 0.00048608232809542814,
|
| 19910 |
+
"loss": 16.9837,
|
| 19911 |
+
"step": 56860
|
| 19912 |
+
},
|
| 19913 |
+
{
|
| 19914 |
+
"epoch": 0.08425717993233355,
|
| 19915 |
+
"grad_norm": 6.40625,
|
| 19916 |
+
"learning_rate": 0.00048607738916042054,
|
| 19917 |
+
"loss": 17.0325,
|
| 19918 |
+
"step": 56880
|
| 19919 |
+
},
|
| 19920 |
+
{
|
| 19921 |
+
"epoch": 0.08428680622626193,
|
| 19922 |
+
"grad_norm": 6.5625,
|
| 19923 |
+
"learning_rate": 0.00048607245022541304,
|
| 19924 |
+
"loss": 17.0513,
|
| 19925 |
+
"step": 56900
|
| 19926 |
+
},
|
| 19927 |
+
{
|
| 19928 |
+
"epoch": 0.08431643252019032,
|
| 19929 |
+
"grad_norm": 6.71875,
|
| 19930 |
+
"learning_rate": 0.00048606751129040543,
|
| 19931 |
+
"loss": 17.0624,
|
| 19932 |
+
"step": 56920
|
| 19933 |
+
},
|
| 19934 |
+
{
|
| 19935 |
+
"epoch": 0.0843460588141187,
|
| 19936 |
+
"grad_norm": 6.8125,
|
| 19937 |
+
"learning_rate": 0.0004860625723553979,
|
| 19938 |
+
"loss": 17.0052,
|
| 19939 |
+
"step": 56940
|
| 19940 |
+
},
|
| 19941 |
+
{
|
| 19942 |
+
"epoch": 0.08437568510804709,
|
| 19943 |
+
"grad_norm": 7.09375,
|
| 19944 |
+
"learning_rate": 0.0004860576334203903,
|
| 19945 |
+
"loss": 17.0705,
|
| 19946 |
+
"step": 56960
|
| 19947 |
+
},
|
| 19948 |
+
{
|
| 19949 |
+
"epoch": 0.08440531140197548,
|
| 19950 |
+
"grad_norm": 7.0625,
|
| 19951 |
+
"learning_rate": 0.0004860526944853828,
|
| 19952 |
+
"loss": 16.9932,
|
| 19953 |
+
"step": 56980
|
| 19954 |
+
},
|
| 19955 |
+
{
|
| 19956 |
+
"epoch": 0.08443493769590386,
|
| 19957 |
+
"grad_norm": 6.9375,
|
| 19958 |
+
"learning_rate": 0.00048604775555037517,
|
| 19959 |
+
"loss": 16.9712,
|
| 19960 |
+
"step": 57000
|
| 19961 |
}
|
| 19962 |
],
|
| 19963 |
"logging_steps": 20,
|
|
|
|
| 19977 |
"attributes": {}
|
| 19978 |
}
|
| 19979 |
},
|
| 19980 |
+
"total_flos": 4.190665024641329e+19,
|
| 19981 |
"train_batch_size": 48,
|
| 19982 |
"trial_name": null,
|
| 19983 |
"trial_params": null
|