Training in progress, step 72000, checkpoint
Browse files- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/pytorch_model.bin +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +353 -3
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 304481530
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fbefc43fcc2f8bf8bb8522016041f2a9a7a1389e937a0c7f9efe740c9281e923
|
| 3 |
size 304481530
|
last-checkpoint/pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 402029570
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0e10ee0c90a6cc09cdc24b1085749ee192ca52841ac52349ee023c635a106f71
|
| 3 |
size 402029570
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0a2f1706dfc950df47249e8d65d6df596c2f98887c24dba54cde743e4804d2cf
|
| 3 |
size 14960
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:488c74f8a1dc2a7148ae3d9f18c7e9fcbb141512e2f149cd1d29674d054be2f3
|
| 3 |
size 14960
|
last-checkpoint/rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:77448ddbc0e5f35d8ef3a4b1063eb25209d701957cc23b3671796af1520e431c
|
| 3 |
size 14960
|
last-checkpoint/rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c3acb48030fde17938d59bf929c695a9b6dbd4fe2687e2cce76096a6e14351d6
|
| 3 |
size 14960
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:514d743b09cdf67b5f7ccba0c67283da3d20aa73a759bcf5ebfccf66234e08c8
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -24858,6 +24858,356 @@
|
|
| 24858 |
"learning_rate": 0.00048259050104507866,
|
| 24859 |
"loss": 16.5599,
|
| 24860 |
"step": 71000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24861 |
}
|
| 24862 |
],
|
| 24863 |
"logging_steps": 20,
|
|
@@ -24877,7 +25227,7 @@
|
|
| 24877 |
"attributes": {}
|
| 24878 |
}
|
| 24879 |
},
|
| 24880 |
-
"total_flos": 5.
|
| 24881 |
"train_batch_size": 48,
|
| 24882 |
"trial_name": null,
|
| 24883 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.10665465814219437,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 72000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 24858 |
"learning_rate": 0.00048259050104507866,
|
| 24859 |
"loss": 16.5599,
|
| 24860 |
"step": 71000
|
| 24861 |
+
},
|
| 24862 |
+
{
|
| 24863 |
+
"epoch": 0.10520296973970338,
|
| 24864 |
+
"grad_norm": 6.84375,
|
| 24865 |
+
"learning_rate": 0.0004825855621100711,
|
| 24866 |
+
"loss": 16.4962,
|
| 24867 |
+
"step": 71020
|
| 24868 |
+
},
|
| 24869 |
+
{
|
| 24870 |
+
"epoch": 0.10523259603363178,
|
| 24871 |
+
"grad_norm": 6.875,
|
| 24872 |
+
"learning_rate": 0.0004825806231750635,
|
| 24873 |
+
"loss": 16.4845,
|
| 24874 |
+
"step": 71040
|
| 24875 |
+
},
|
| 24876 |
+
{
|
| 24877 |
+
"epoch": 0.10526222232756016,
|
| 24878 |
+
"grad_norm": 6.71875,
|
| 24879 |
+
"learning_rate": 0.00048257568424005595,
|
| 24880 |
+
"loss": 16.5411,
|
| 24881 |
+
"step": 71060
|
| 24882 |
+
},
|
| 24883 |
+
{
|
| 24884 |
+
"epoch": 0.10529184862148855,
|
| 24885 |
+
"grad_norm": 6.65625,
|
| 24886 |
+
"learning_rate": 0.0004825707453050484,
|
| 24887 |
+
"loss": 16.5235,
|
| 24888 |
+
"step": 71080
|
| 24889 |
+
},
|
| 24890 |
+
{
|
| 24891 |
+
"epoch": 0.10532147491541693,
|
| 24892 |
+
"grad_norm": 6.375,
|
| 24893 |
+
"learning_rate": 0.00048256580637004085,
|
| 24894 |
+
"loss": 16.5079,
|
| 24895 |
+
"step": 71100
|
| 24896 |
+
},
|
| 24897 |
+
{
|
| 24898 |
+
"epoch": 0.10535110120934532,
|
| 24899 |
+
"grad_norm": 6.375,
|
| 24900 |
+
"learning_rate": 0.00048256086743503324,
|
| 24901 |
+
"loss": 16.5093,
|
| 24902 |
+
"step": 71120
|
| 24903 |
+
},
|
| 24904 |
+
{
|
| 24905 |
+
"epoch": 0.10538072750327371,
|
| 24906 |
+
"grad_norm": 6.84375,
|
| 24907 |
+
"learning_rate": 0.0004825559285000257,
|
| 24908 |
+
"loss": 16.5136,
|
| 24909 |
+
"step": 71140
|
| 24910 |
+
},
|
| 24911 |
+
{
|
| 24912 |
+
"epoch": 0.1054103537972021,
|
| 24913 |
+
"grad_norm": 6.8125,
|
| 24914 |
+
"learning_rate": 0.00048255098956501814,
|
| 24915 |
+
"loss": 16.4724,
|
| 24916 |
+
"step": 71160
|
| 24917 |
+
},
|
| 24918 |
+
{
|
| 24919 |
+
"epoch": 0.10543998009113048,
|
| 24920 |
+
"grad_norm": 6.25,
|
| 24921 |
+
"learning_rate": 0.0004825460506300106,
|
| 24922 |
+
"loss": 16.4691,
|
| 24923 |
+
"step": 71180
|
| 24924 |
+
},
|
| 24925 |
+
{
|
| 24926 |
+
"epoch": 0.10546960638505887,
|
| 24927 |
+
"grad_norm": 7.96875,
|
| 24928 |
+
"learning_rate": 0.000482541111695003,
|
| 24929 |
+
"loss": 16.4072,
|
| 24930 |
+
"step": 71200
|
| 24931 |
+
},
|
| 24932 |
+
{
|
| 24933 |
+
"epoch": 0.10549923267898725,
|
| 24934 |
+
"grad_norm": 6.375,
|
| 24935 |
+
"learning_rate": 0.0004825361727599954,
|
| 24936 |
+
"loss": 16.531,
|
| 24937 |
+
"step": 71220
|
| 24938 |
+
},
|
| 24939 |
+
{
|
| 24940 |
+
"epoch": 0.10552885897291564,
|
| 24941 |
+
"grad_norm": 6.3125,
|
| 24942 |
+
"learning_rate": 0.0004825312338249879,
|
| 24943 |
+
"loss": 16.5211,
|
| 24944 |
+
"step": 71240
|
| 24945 |
+
},
|
| 24946 |
+
{
|
| 24947 |
+
"epoch": 0.10555848526684403,
|
| 24948 |
+
"grad_norm": 6.625,
|
| 24949 |
+
"learning_rate": 0.00048252629488998027,
|
| 24950 |
+
"loss": 16.5079,
|
| 24951 |
+
"step": 71260
|
| 24952 |
+
},
|
| 24953 |
+
{
|
| 24954 |
+
"epoch": 0.10558811156077241,
|
| 24955 |
+
"grad_norm": 6.5625,
|
| 24956 |
+
"learning_rate": 0.0004825213559549727,
|
| 24957 |
+
"loss": 16.4813,
|
| 24958 |
+
"step": 71280
|
| 24959 |
+
},
|
| 24960 |
+
{
|
| 24961 |
+
"epoch": 0.1056177378547008,
|
| 24962 |
+
"grad_norm": 7.5,
|
| 24963 |
+
"learning_rate": 0.00048251641701996516,
|
| 24964 |
+
"loss": 16.5194,
|
| 24965 |
+
"step": 71300
|
| 24966 |
+
},
|
| 24967 |
+
{
|
| 24968 |
+
"epoch": 0.10564736414862919,
|
| 24969 |
+
"grad_norm": 6.84375,
|
| 24970 |
+
"learning_rate": 0.0004825114780849576,
|
| 24971 |
+
"loss": 16.4672,
|
| 24972 |
+
"step": 71320
|
| 24973 |
+
},
|
| 24974 |
+
{
|
| 24975 |
+
"epoch": 0.10567699044255757,
|
| 24976 |
+
"grad_norm": 6.90625,
|
| 24977 |
+
"learning_rate": 0.00048250653914995,
|
| 24978 |
+
"loss": 16.5088,
|
| 24979 |
+
"step": 71340
|
| 24980 |
+
},
|
| 24981 |
+
{
|
| 24982 |
+
"epoch": 0.10570661673648597,
|
| 24983 |
+
"grad_norm": 7.53125,
|
| 24984 |
+
"learning_rate": 0.00048250160021494245,
|
| 24985 |
+
"loss": 16.5076,
|
| 24986 |
+
"step": 71360
|
| 24987 |
+
},
|
| 24988 |
+
{
|
| 24989 |
+
"epoch": 0.10573624303041436,
|
| 24990 |
+
"grad_norm": 5.65625,
|
| 24991 |
+
"learning_rate": 0.0004824966612799349,
|
| 24992 |
+
"loss": 16.4723,
|
| 24993 |
+
"step": 71380
|
| 24994 |
+
},
|
| 24995 |
+
{
|
| 24996 |
+
"epoch": 0.10576586932434275,
|
| 24997 |
+
"grad_norm": 6.3125,
|
| 24998 |
+
"learning_rate": 0.00048249172234492735,
|
| 24999 |
+
"loss": 16.4759,
|
| 25000 |
+
"step": 71400
|
| 25001 |
+
},
|
| 25002 |
+
{
|
| 25003 |
+
"epoch": 0.10579549561827113,
|
| 25004 |
+
"grad_norm": 6.03125,
|
| 25005 |
+
"learning_rate": 0.00048248678340991974,
|
| 25006 |
+
"loss": 16.4789,
|
| 25007 |
+
"step": 71420
|
| 25008 |
+
},
|
| 25009 |
+
{
|
| 25010 |
+
"epoch": 0.10582512191219952,
|
| 25011 |
+
"grad_norm": 6.71875,
|
| 25012 |
+
"learning_rate": 0.0004824818444749122,
|
| 25013 |
+
"loss": 16.5194,
|
| 25014 |
+
"step": 71440
|
| 25015 |
+
},
|
| 25016 |
+
{
|
| 25017 |
+
"epoch": 0.1058547482061279,
|
| 25018 |
+
"grad_norm": 7.1875,
|
| 25019 |
+
"learning_rate": 0.00048247690553990464,
|
| 25020 |
+
"loss": 16.5426,
|
| 25021 |
+
"step": 71460
|
| 25022 |
+
},
|
| 25023 |
+
{
|
| 25024 |
+
"epoch": 0.10588437450005629,
|
| 25025 |
+
"grad_norm": 6.40625,
|
| 25026 |
+
"learning_rate": 0.0004824719666048971,
|
| 25027 |
+
"loss": 16.5093,
|
| 25028 |
+
"step": 71480
|
| 25029 |
+
},
|
| 25030 |
+
{
|
| 25031 |
+
"epoch": 0.10591400079398468,
|
| 25032 |
+
"grad_norm": 7.28125,
|
| 25033 |
+
"learning_rate": 0.0004824670276698895,
|
| 25034 |
+
"loss": 16.5204,
|
| 25035 |
+
"step": 71500
|
| 25036 |
+
},
|
| 25037 |
+
{
|
| 25038 |
+
"epoch": 0.10594362708791306,
|
| 25039 |
+
"grad_norm": 7.0,
|
| 25040 |
+
"learning_rate": 0.000482462088734882,
|
| 25041 |
+
"loss": 16.5197,
|
| 25042 |
+
"step": 71520
|
| 25043 |
+
},
|
| 25044 |
+
{
|
| 25045 |
+
"epoch": 0.10597325338184145,
|
| 25046 |
+
"grad_norm": 6.1875,
|
| 25047 |
+
"learning_rate": 0.0004824571497998744,
|
| 25048 |
+
"loss": 16.4954,
|
| 25049 |
+
"step": 71540
|
| 25050 |
+
},
|
| 25051 |
+
{
|
| 25052 |
+
"epoch": 0.10600287967576984,
|
| 25053 |
+
"grad_norm": 6.6875,
|
| 25054 |
+
"learning_rate": 0.00048245221086486677,
|
| 25055 |
+
"loss": 16.4379,
|
| 25056 |
+
"step": 71560
|
| 25057 |
+
},
|
| 25058 |
+
{
|
| 25059 |
+
"epoch": 0.10603250596969822,
|
| 25060 |
+
"grad_norm": 7.8125,
|
| 25061 |
+
"learning_rate": 0.0004824472719298592,
|
| 25062 |
+
"loss": 16.4314,
|
| 25063 |
+
"step": 71580
|
| 25064 |
+
},
|
| 25065 |
+
{
|
| 25066 |
+
"epoch": 0.10606213226362661,
|
| 25067 |
+
"grad_norm": 7.40625,
|
| 25068 |
+
"learning_rate": 0.00048244233299485166,
|
| 25069 |
+
"loss": 16.4561,
|
| 25070 |
+
"step": 71600
|
| 25071 |
+
},
|
| 25072 |
+
{
|
| 25073 |
+
"epoch": 0.106091758557555,
|
| 25074 |
+
"grad_norm": 6.625,
|
| 25075 |
+
"learning_rate": 0.0004824373940598441,
|
| 25076 |
+
"loss": 16.5224,
|
| 25077 |
+
"step": 71620
|
| 25078 |
+
},
|
| 25079 |
+
{
|
| 25080 |
+
"epoch": 0.10612138485148338,
|
| 25081 |
+
"grad_norm": 6.78125,
|
| 25082 |
+
"learning_rate": 0.0004824324551248365,
|
| 25083 |
+
"loss": 16.4732,
|
| 25084 |
+
"step": 71640
|
| 25085 |
+
},
|
| 25086 |
+
{
|
| 25087 |
+
"epoch": 0.10615101114541177,
|
| 25088 |
+
"grad_norm": 6.90625,
|
| 25089 |
+
"learning_rate": 0.00048242751618982895,
|
| 25090 |
+
"loss": 16.4654,
|
| 25091 |
+
"step": 71660
|
| 25092 |
+
},
|
| 25093 |
+
{
|
| 25094 |
+
"epoch": 0.10618063743934017,
|
| 25095 |
+
"grad_norm": 7.875,
|
| 25096 |
+
"learning_rate": 0.0004824225772548214,
|
| 25097 |
+
"loss": 16.5161,
|
| 25098 |
+
"step": 71680
|
| 25099 |
+
},
|
| 25100 |
+
{
|
| 25101 |
+
"epoch": 0.10621026373326856,
|
| 25102 |
+
"grad_norm": 6.09375,
|
| 25103 |
+
"learning_rate": 0.00048241763831981385,
|
| 25104 |
+
"loss": 16.446,
|
| 25105 |
+
"step": 71700
|
| 25106 |
+
},
|
| 25107 |
+
{
|
| 25108 |
+
"epoch": 0.10623989002719694,
|
| 25109 |
+
"grad_norm": 7.1875,
|
| 25110 |
+
"learning_rate": 0.00048241269938480624,
|
| 25111 |
+
"loss": 16.5245,
|
| 25112 |
+
"step": 71720
|
| 25113 |
+
},
|
| 25114 |
+
{
|
| 25115 |
+
"epoch": 0.10626951632112533,
|
| 25116 |
+
"grad_norm": 6.1875,
|
| 25117 |
+
"learning_rate": 0.0004824077604497987,
|
| 25118 |
+
"loss": 16.4989,
|
| 25119 |
+
"step": 71740
|
| 25120 |
+
},
|
| 25121 |
+
{
|
| 25122 |
+
"epoch": 0.10629914261505372,
|
| 25123 |
+
"grad_norm": 7.0,
|
| 25124 |
+
"learning_rate": 0.00048240282151479114,
|
| 25125 |
+
"loss": 16.5321,
|
| 25126 |
+
"step": 71760
|
| 25127 |
+
},
|
| 25128 |
+
{
|
| 25129 |
+
"epoch": 0.1063287689089821,
|
| 25130 |
+
"grad_norm": 6.15625,
|
| 25131 |
+
"learning_rate": 0.0004823978825797836,
|
| 25132 |
+
"loss": 16.4498,
|
| 25133 |
+
"step": 71780
|
| 25134 |
+
},
|
| 25135 |
+
{
|
| 25136 |
+
"epoch": 0.10635839520291049,
|
| 25137 |
+
"grad_norm": 6.875,
|
| 25138 |
+
"learning_rate": 0.000482392943644776,
|
| 25139 |
+
"loss": 16.4793,
|
| 25140 |
+
"step": 71800
|
| 25141 |
+
},
|
| 25142 |
+
{
|
| 25143 |
+
"epoch": 0.10638802149683887,
|
| 25144 |
+
"grad_norm": 7.90625,
|
| 25145 |
+
"learning_rate": 0.0004823880047097685,
|
| 25146 |
+
"loss": 16.4312,
|
| 25147 |
+
"step": 71820
|
| 25148 |
+
},
|
| 25149 |
+
{
|
| 25150 |
+
"epoch": 0.10641764779076726,
|
| 25151 |
+
"grad_norm": 6.625,
|
| 25152 |
+
"learning_rate": 0.0004823830657747609,
|
| 25153 |
+
"loss": 16.5154,
|
| 25154 |
+
"step": 71840
|
| 25155 |
+
},
|
| 25156 |
+
{
|
| 25157 |
+
"epoch": 0.10644727408469565,
|
| 25158 |
+
"grad_norm": 7.5,
|
| 25159 |
+
"learning_rate": 0.00048237812683975327,
|
| 25160 |
+
"loss": 16.4446,
|
| 25161 |
+
"step": 71860
|
| 25162 |
+
},
|
| 25163 |
+
{
|
| 25164 |
+
"epoch": 0.10647690037862403,
|
| 25165 |
+
"grad_norm": 7.25,
|
| 25166 |
+
"learning_rate": 0.0004823731879047457,
|
| 25167 |
+
"loss": 16.5021,
|
| 25168 |
+
"step": 71880
|
| 25169 |
+
},
|
| 25170 |
+
{
|
| 25171 |
+
"epoch": 0.10650652667255242,
|
| 25172 |
+
"grad_norm": 6.5,
|
| 25173 |
+
"learning_rate": 0.00048236824896973816,
|
| 25174 |
+
"loss": 16.4583,
|
| 25175 |
+
"step": 71900
|
| 25176 |
+
},
|
| 25177 |
+
{
|
| 25178 |
+
"epoch": 0.10653615296648081,
|
| 25179 |
+
"grad_norm": 6.625,
|
| 25180 |
+
"learning_rate": 0.0004823633100347306,
|
| 25181 |
+
"loss": 16.4529,
|
| 25182 |
+
"step": 71920
|
| 25183 |
+
},
|
| 25184 |
+
{
|
| 25185 |
+
"epoch": 0.1065657792604092,
|
| 25186 |
+
"grad_norm": 6.15625,
|
| 25187 |
+
"learning_rate": 0.000482358371099723,
|
| 25188 |
+
"loss": 16.4529,
|
| 25189 |
+
"step": 71940
|
| 25190 |
+
},
|
| 25191 |
+
{
|
| 25192 |
+
"epoch": 0.10659540555433758,
|
| 25193 |
+
"grad_norm": 6.53125,
|
| 25194 |
+
"learning_rate": 0.00048235343216471545,
|
| 25195 |
+
"loss": 16.4638,
|
| 25196 |
+
"step": 71960
|
| 25197 |
+
},
|
| 25198 |
+
{
|
| 25199 |
+
"epoch": 0.10662503184826597,
|
| 25200 |
+
"grad_norm": 6.5,
|
| 25201 |
+
"learning_rate": 0.0004823484932297079,
|
| 25202 |
+
"loss": 16.4739,
|
| 25203 |
+
"step": 71980
|
| 25204 |
+
},
|
| 25205 |
+
{
|
| 25206 |
+
"epoch": 0.10665465814219437,
|
| 25207 |
+
"grad_norm": 6.75,
|
| 25208 |
+
"learning_rate": 0.00048234355429470035,
|
| 25209 |
+
"loss": 16.5261,
|
| 25210 |
+
"step": 72000
|
| 25211 |
}
|
| 25212 |
],
|
| 25213 |
"logging_steps": 20,
|
|
|
|
| 25227 |
"attributes": {}
|
| 25228 |
}
|
| 25229 |
},
|
| 25230 |
+
"total_flos": 5.293707639198528e+19,
|
| 25231 |
"train_batch_size": 48,
|
| 25232 |
"trial_name": null,
|
| 25233 |
"trial_params": null
|