Training in progress, step 10500, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 536223056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4cc6d2ac14b136a0c5c39d3842c8290195765d0231c31019222880ab2ada323a
|
| 3 |
size 536223056
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1072594443
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e6c4e658acbdc5e0bc6eda245ab297a40c16a3c1814b13d63c1d7cae82962a95
|
| 3 |
size 1072594443
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9f1b6e95985cf829ad61f7f680a73f323339cc556ff96e0fd4cb8e86a2237898
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 2.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -9008,6 +9008,456 @@
|
|
| 9008 |
"mean_token_accuracy": 0.801843786239624,
|
| 9009 |
"num_tokens": 11076275.0,
|
| 9010 |
"step": 10000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9011 |
}
|
| 9012 |
],
|
| 9013 |
"logging_steps": 10,
|
|
@@ -9027,7 +9477,7 @@
|
|
| 9027 |
"attributes": {}
|
| 9028 |
}
|
| 9029 |
},
|
| 9030 |
-
"total_flos": 1.
|
| 9031 |
"train_batch_size": 8,
|
| 9032 |
"trial_name": null,
|
| 9033 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.1156558533145273,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 10500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 9008 |
"mean_token_accuracy": 0.801843786239624,
|
| 9009 |
"num_tokens": 11076275.0,
|
| 9010 |
"step": 10000
|
| 9011 |
+
},
|
| 9012 |
+
{
|
| 9013 |
+
"epoch": 2.016925246826516,
|
| 9014 |
+
"grad_norm": 12.0625,
|
| 9015 |
+
"learning_rate": 6.555174961380886e-06,
|
| 9016 |
+
"loss": 0.7623,
|
| 9017 |
+
"mean_token_accuracy": 0.809873354434967,
|
| 9018 |
+
"num_tokens": 11087022.0,
|
| 9019 |
+
"step": 10010
|
| 9020 |
+
},
|
| 9021 |
+
{
|
| 9022 |
+
"epoch": 2.0189401571630063,
|
| 9023 |
+
"grad_norm": 11.0625,
|
| 9024 |
+
"learning_rate": 6.5417422258042855e-06,
|
| 9025 |
+
"loss": 0.9055,
|
| 9026 |
+
"mean_token_accuracy": 0.7793718695640564,
|
| 9027 |
+
"num_tokens": 11098027.0,
|
| 9028 |
+
"step": 10020
|
| 9029 |
+
},
|
| 9030 |
+
{
|
| 9031 |
+
"epoch": 2.0209550674994965,
|
| 9032 |
+
"grad_norm": 12.1875,
|
| 9033 |
+
"learning_rate": 6.528309490227685e-06,
|
| 9034 |
+
"loss": 0.7758,
|
| 9035 |
+
"mean_token_accuracy": 0.8036232054233551,
|
| 9036 |
+
"num_tokens": 11110290.0,
|
| 9037 |
+
"step": 10030
|
| 9038 |
+
},
|
| 9039 |
+
{
|
| 9040 |
+
"epoch": 2.022969977835986,
|
| 9041 |
+
"grad_norm": 11.6875,
|
| 9042 |
+
"learning_rate": 6.514876754651085e-06,
|
| 9043 |
+
"loss": 0.7201,
|
| 9044 |
+
"mean_token_accuracy": 0.8190805375576019,
|
| 9045 |
+
"num_tokens": 11120919.0,
|
| 9046 |
+
"step": 10040
|
| 9047 |
+
},
|
| 9048 |
+
{
|
| 9049 |
+
"epoch": 2.0249848881724763,
|
| 9050 |
+
"grad_norm": 12.5,
|
| 9051 |
+
"learning_rate": 6.501444019074485e-06,
|
| 9052 |
+
"loss": 0.9213,
|
| 9053 |
+
"mean_token_accuracy": 0.7752482295036316,
|
| 9054 |
+
"num_tokens": 11132545.0,
|
| 9055 |
+
"step": 10050
|
| 9056 |
+
},
|
| 9057 |
+
{
|
| 9058 |
+
"epoch": 2.0269997985089665,
|
| 9059 |
+
"grad_norm": 12.6875,
|
| 9060 |
+
"learning_rate": 6.488011283497885e-06,
|
| 9061 |
+
"loss": 0.8841,
|
| 9062 |
+
"mean_token_accuracy": 0.7821820974349976,
|
| 9063 |
+
"num_tokens": 11143351.0,
|
| 9064 |
+
"step": 10060
|
| 9065 |
+
},
|
| 9066 |
+
{
|
| 9067 |
+
"epoch": 2.029014708845456,
|
| 9068 |
+
"grad_norm": 12.1875,
|
| 9069 |
+
"learning_rate": 6.474578547921284e-06,
|
| 9070 |
+
"loss": 0.7844,
|
| 9071 |
+
"mean_token_accuracy": 0.7987569034099579,
|
| 9072 |
+
"num_tokens": 11153881.0,
|
| 9073 |
+
"step": 10070
|
| 9074 |
+
},
|
| 9075 |
+
{
|
| 9076 |
+
"epoch": 2.0310296191819464,
|
| 9077 |
+
"grad_norm": 11.75,
|
| 9078 |
+
"learning_rate": 6.461145812344684e-06,
|
| 9079 |
+
"loss": 0.8333,
|
| 9080 |
+
"mean_token_accuracy": 0.790239280462265,
|
| 9081 |
+
"num_tokens": 11164316.0,
|
| 9082 |
+
"step": 10080
|
| 9083 |
+
},
|
| 9084 |
+
{
|
| 9085 |
+
"epoch": 2.0330445295184365,
|
| 9086 |
+
"grad_norm": 10.0,
|
| 9087 |
+
"learning_rate": 6.447713076768084e-06,
|
| 9088 |
+
"loss": 0.7952,
|
| 9089 |
+
"mean_token_accuracy": 0.798451715707779,
|
| 9090 |
+
"num_tokens": 11175390.0,
|
| 9091 |
+
"step": 10090
|
| 9092 |
+
},
|
| 9093 |
+
{
|
| 9094 |
+
"epoch": 2.0350594398549267,
|
| 9095 |
+
"grad_norm": 11.125,
|
| 9096 |
+
"learning_rate": 6.434280341191485e-06,
|
| 9097 |
+
"loss": 0.866,
|
| 9098 |
+
"mean_token_accuracy": 0.7892335176467895,
|
| 9099 |
+
"num_tokens": 11187303.0,
|
| 9100 |
+
"step": 10100
|
| 9101 |
+
},
|
| 9102 |
+
{
|
| 9103 |
+
"epoch": 2.0370743501914164,
|
| 9104 |
+
"grad_norm": 12.875,
|
| 9105 |
+
"learning_rate": 6.4208476056148835e-06,
|
| 9106 |
+
"loss": 0.8048,
|
| 9107 |
+
"mean_token_accuracy": 0.796100115776062,
|
| 9108 |
+
"num_tokens": 11198006.0,
|
| 9109 |
+
"step": 10110
|
| 9110 |
+
},
|
| 9111 |
+
{
|
| 9112 |
+
"epoch": 2.0390892605279065,
|
| 9113 |
+
"grad_norm": 11.8125,
|
| 9114 |
+
"learning_rate": 6.407414870038284e-06,
|
| 9115 |
+
"loss": 0.8066,
|
| 9116 |
+
"mean_token_accuracy": 0.796057403087616,
|
| 9117 |
+
"num_tokens": 11207940.0,
|
| 9118 |
+
"step": 10120
|
| 9119 |
+
},
|
| 9120 |
+
{
|
| 9121 |
+
"epoch": 2.0411041708643967,
|
| 9122 |
+
"grad_norm": 12.25,
|
| 9123 |
+
"learning_rate": 6.393982134461684e-06,
|
| 9124 |
+
"loss": 0.7275,
|
| 9125 |
+
"mean_token_accuracy": 0.818196564912796,
|
| 9126 |
+
"num_tokens": 11218927.0,
|
| 9127 |
+
"step": 10130
|
| 9128 |
+
},
|
| 9129 |
+
{
|
| 9130 |
+
"epoch": 2.0431190812008864,
|
| 9131 |
+
"grad_norm": 12.5,
|
| 9132 |
+
"learning_rate": 6.380549398885083e-06,
|
| 9133 |
+
"loss": 1.0617,
|
| 9134 |
+
"mean_token_accuracy": 0.755308473110199,
|
| 9135 |
+
"num_tokens": 11230139.0,
|
| 9136 |
+
"step": 10140
|
| 9137 |
+
},
|
| 9138 |
+
{
|
| 9139 |
+
"epoch": 2.0451339915373765,
|
| 9140 |
+
"grad_norm": 9.75,
|
| 9141 |
+
"learning_rate": 6.367116663308484e-06,
|
| 9142 |
+
"loss": 0.793,
|
| 9143 |
+
"mean_token_accuracy": 0.8068155586719513,
|
| 9144 |
+
"num_tokens": 11241453.0,
|
| 9145 |
+
"step": 10150
|
| 9146 |
+
},
|
| 9147 |
+
{
|
| 9148 |
+
"epoch": 2.0471489018738667,
|
| 9149 |
+
"grad_norm": 9.3125,
|
| 9150 |
+
"learning_rate": 6.353683927731883e-06,
|
| 9151 |
+
"loss": 0.9479,
|
| 9152 |
+
"mean_token_accuracy": 0.7669933021068573,
|
| 9153 |
+
"num_tokens": 11253740.0,
|
| 9154 |
+
"step": 10160
|
| 9155 |
+
},
|
| 9156 |
+
{
|
| 9157 |
+
"epoch": 2.049163812210357,
|
| 9158 |
+
"grad_norm": 9.3125,
|
| 9159 |
+
"learning_rate": 6.340251192155283e-06,
|
| 9160 |
+
"loss": 0.6941,
|
| 9161 |
+
"mean_token_accuracy": 0.8190494358539582,
|
| 9162 |
+
"num_tokens": 11266238.0,
|
| 9163 |
+
"step": 10170
|
| 9164 |
+
},
|
| 9165 |
+
{
|
| 9166 |
+
"epoch": 2.0511787225468465,
|
| 9167 |
+
"grad_norm": 10.9375,
|
| 9168 |
+
"learning_rate": 6.326818456578683e-06,
|
| 9169 |
+
"loss": 0.7265,
|
| 9170 |
+
"mean_token_accuracy": 0.8130114257335663,
|
| 9171 |
+
"num_tokens": 11277298.0,
|
| 9172 |
+
"step": 10180
|
| 9173 |
+
},
|
| 9174 |
+
{
|
| 9175 |
+
"epoch": 2.0531936328833367,
|
| 9176 |
+
"grad_norm": 10.5625,
|
| 9177 |
+
"learning_rate": 6.313385721002082e-06,
|
| 9178 |
+
"loss": 0.7497,
|
| 9179 |
+
"mean_token_accuracy": 0.8102090239524842,
|
| 9180 |
+
"num_tokens": 11289134.0,
|
| 9181 |
+
"step": 10190
|
| 9182 |
+
},
|
| 9183 |
+
{
|
| 9184 |
+
"epoch": 2.055208543219827,
|
| 9185 |
+
"grad_norm": 12.875,
|
| 9186 |
+
"learning_rate": 6.299952985425482e-06,
|
| 9187 |
+
"loss": 0.823,
|
| 9188 |
+
"mean_token_accuracy": 0.7950894236564636,
|
| 9189 |
+
"num_tokens": 11299969.0,
|
| 9190 |
+
"step": 10200
|
| 9191 |
+
},
|
| 9192 |
+
{
|
| 9193 |
+
"epoch": 2.0572234535563165,
|
| 9194 |
+
"grad_norm": 11.75,
|
| 9195 |
+
"learning_rate": 6.286520249848882e-06,
|
| 9196 |
+
"loss": 0.7238,
|
| 9197 |
+
"mean_token_accuracy": 0.8222428441047669,
|
| 9198 |
+
"num_tokens": 11311491.0,
|
| 9199 |
+
"step": 10210
|
| 9200 |
+
},
|
| 9201 |
+
{
|
| 9202 |
+
"epoch": 2.0592383638928067,
|
| 9203 |
+
"grad_norm": 13.4375,
|
| 9204 |
+
"learning_rate": 6.2730875142722825e-06,
|
| 9205 |
+
"loss": 0.8528,
|
| 9206 |
+
"mean_token_accuracy": 0.7913759410381317,
|
| 9207 |
+
"num_tokens": 11321862.0,
|
| 9208 |
+
"step": 10220
|
| 9209 |
+
},
|
| 9210 |
+
{
|
| 9211 |
+
"epoch": 2.061253274229297,
|
| 9212 |
+
"grad_norm": 10.6875,
|
| 9213 |
+
"learning_rate": 6.259654778695682e-06,
|
| 9214 |
+
"loss": 0.7371,
|
| 9215 |
+
"mean_token_accuracy": 0.8153777897357941,
|
| 9216 |
+
"num_tokens": 11333343.0,
|
| 9217 |
+
"step": 10230
|
| 9218 |
+
},
|
| 9219 |
+
{
|
| 9220 |
+
"epoch": 2.063268184565787,
|
| 9221 |
+
"grad_norm": 11.0,
|
| 9222 |
+
"learning_rate": 6.246222043119081e-06,
|
| 9223 |
+
"loss": 0.8531,
|
| 9224 |
+
"mean_token_accuracy": 0.7924916267395019,
|
| 9225 |
+
"num_tokens": 11345705.0,
|
| 9226 |
+
"step": 10240
|
| 9227 |
+
},
|
| 9228 |
+
{
|
| 9229 |
+
"epoch": 2.0652830949022767,
|
| 9230 |
+
"grad_norm": 11.125,
|
| 9231 |
+
"learning_rate": 6.2327893075424815e-06,
|
| 9232 |
+
"loss": 0.7911,
|
| 9233 |
+
"mean_token_accuracy": 0.8001395165920258,
|
| 9234 |
+
"num_tokens": 11356265.0,
|
| 9235 |
+
"step": 10250
|
| 9236 |
+
},
|
| 9237 |
+
{
|
| 9238 |
+
"epoch": 2.067298005238767,
|
| 9239 |
+
"grad_norm": 11.5625,
|
| 9240 |
+
"learning_rate": 6.219356571965881e-06,
|
| 9241 |
+
"loss": 0.7311,
|
| 9242 |
+
"mean_token_accuracy": 0.8132711887359619,
|
| 9243 |
+
"num_tokens": 11366725.0,
|
| 9244 |
+
"step": 10260
|
| 9245 |
+
},
|
| 9246 |
+
{
|
| 9247 |
+
"epoch": 2.069312915575257,
|
| 9248 |
+
"grad_norm": 11.4375,
|
| 9249 |
+
"learning_rate": 6.205923836389282e-06,
|
| 9250 |
+
"loss": 0.8873,
|
| 9251 |
+
"mean_token_accuracy": 0.7833378136157989,
|
| 9252 |
+
"num_tokens": 11377435.0,
|
| 9253 |
+
"step": 10270
|
| 9254 |
+
},
|
| 9255 |
+
{
|
| 9256 |
+
"epoch": 2.071327825911747,
|
| 9257 |
+
"grad_norm": 10.5625,
|
| 9258 |
+
"learning_rate": 6.192491100812681e-06,
|
| 9259 |
+
"loss": 0.802,
|
| 9260 |
+
"mean_token_accuracy": 0.805024367570877,
|
| 9261 |
+
"num_tokens": 11388919.0,
|
| 9262 |
+
"step": 10280
|
| 9263 |
+
},
|
| 9264 |
+
{
|
| 9265 |
+
"epoch": 2.073342736248237,
|
| 9266 |
+
"grad_norm": 9.9375,
|
| 9267 |
+
"learning_rate": 6.179058365236081e-06,
|
| 9268 |
+
"loss": 0.782,
|
| 9269 |
+
"mean_token_accuracy": 0.8056276500225067,
|
| 9270 |
+
"num_tokens": 11399842.0,
|
| 9271 |
+
"step": 10290
|
| 9272 |
+
},
|
| 9273 |
+
{
|
| 9274 |
+
"epoch": 2.075357646584727,
|
| 9275 |
+
"grad_norm": 12.125,
|
| 9276 |
+
"learning_rate": 6.165625629659481e-06,
|
| 9277 |
+
"loss": 0.7425,
|
| 9278 |
+
"mean_token_accuracy": 0.8137720346450805,
|
| 9279 |
+
"num_tokens": 11410189.0,
|
| 9280 |
+
"step": 10300
|
| 9281 |
+
},
|
| 9282 |
+
{
|
| 9283 |
+
"epoch": 2.077372556921217,
|
| 9284 |
+
"grad_norm": 13.8125,
|
| 9285 |
+
"learning_rate": 6.1521928940828805e-06,
|
| 9286 |
+
"loss": 0.7876,
|
| 9287 |
+
"mean_token_accuracy": 0.8061869978904724,
|
| 9288 |
+
"num_tokens": 11419892.0,
|
| 9289 |
+
"step": 10310
|
| 9290 |
+
},
|
| 9291 |
+
{
|
| 9292 |
+
"epoch": 2.079387467257707,
|
| 9293 |
+
"grad_norm": 11.125,
|
| 9294 |
+
"learning_rate": 6.138760158506281e-06,
|
| 9295 |
+
"loss": 0.8218,
|
| 9296 |
+
"mean_token_accuracy": 0.7978219330310822,
|
| 9297 |
+
"num_tokens": 11432030.0,
|
| 9298 |
+
"step": 10320
|
| 9299 |
+
},
|
| 9300 |
+
{
|
| 9301 |
+
"epoch": 2.081402377594197,
|
| 9302 |
+
"grad_norm": 12.125,
|
| 9303 |
+
"learning_rate": 6.12532742292968e-06,
|
| 9304 |
+
"loss": 0.9129,
|
| 9305 |
+
"mean_token_accuracy": 0.7785973668098449,
|
| 9306 |
+
"num_tokens": 11443216.0,
|
| 9307 |
+
"step": 10330
|
| 9308 |
+
},
|
| 9309 |
+
{
|
| 9310 |
+
"epoch": 2.083417287930687,
|
| 9311 |
+
"grad_norm": 14.25,
|
| 9312 |
+
"learning_rate": 6.11189468735308e-06,
|
| 9313 |
+
"loss": 0.8782,
|
| 9314 |
+
"mean_token_accuracy": 0.7902339398860931,
|
| 9315 |
+
"num_tokens": 11454439.0,
|
| 9316 |
+
"step": 10340
|
| 9317 |
+
},
|
| 9318 |
+
{
|
| 9319 |
+
"epoch": 2.0854321982671773,
|
| 9320 |
+
"grad_norm": 13.1875,
|
| 9321 |
+
"learning_rate": 6.09846195177648e-06,
|
| 9322 |
+
"loss": 0.7751,
|
| 9323 |
+
"mean_token_accuracy": 0.8015202224254608,
|
| 9324 |
+
"num_tokens": 11465006.0,
|
| 9325 |
+
"step": 10350
|
| 9326 |
+
},
|
| 9327 |
+
{
|
| 9328 |
+
"epoch": 2.087447108603667,
|
| 9329 |
+
"grad_norm": 10.9375,
|
| 9330 |
+
"learning_rate": 6.085029216199879e-06,
|
| 9331 |
+
"loss": 0.8718,
|
| 9332 |
+
"mean_token_accuracy": 0.788787704706192,
|
| 9333 |
+
"num_tokens": 11474763.0,
|
| 9334 |
+
"step": 10360
|
| 9335 |
+
},
|
| 9336 |
+
{
|
| 9337 |
+
"epoch": 2.089462018940157,
|
| 9338 |
+
"grad_norm": 11.875,
|
| 9339 |
+
"learning_rate": 6.071596480623279e-06,
|
| 9340 |
+
"loss": 0.8359,
|
| 9341 |
+
"mean_token_accuracy": 0.7963649094104767,
|
| 9342 |
+
"num_tokens": 11485808.0,
|
| 9343 |
+
"step": 10370
|
| 9344 |
+
},
|
| 9345 |
+
{
|
| 9346 |
+
"epoch": 2.0914769292766473,
|
| 9347 |
+
"grad_norm": 11.4375,
|
| 9348 |
+
"learning_rate": 6.058163745046679e-06,
|
| 9349 |
+
"loss": 0.7756,
|
| 9350 |
+
"mean_token_accuracy": 0.8061880767345428,
|
| 9351 |
+
"num_tokens": 11497158.0,
|
| 9352 |
+
"step": 10380
|
| 9353 |
+
},
|
| 9354 |
+
{
|
| 9355 |
+
"epoch": 2.093491839613137,
|
| 9356 |
+
"grad_norm": 12.1875,
|
| 9357 |
+
"learning_rate": 6.04473100947008e-06,
|
| 9358 |
+
"loss": 0.946,
|
| 9359 |
+
"mean_token_accuracy": 0.7709095120429993,
|
| 9360 |
+
"num_tokens": 11507759.0,
|
| 9361 |
+
"step": 10390
|
| 9362 |
+
},
|
| 9363 |
+
{
|
| 9364 |
+
"epoch": 2.095506749949627,
|
| 9365 |
+
"grad_norm": 10.5625,
|
| 9366 |
+
"learning_rate": 6.0312982738934785e-06,
|
| 9367 |
+
"loss": 0.7665,
|
| 9368 |
+
"mean_token_accuracy": 0.8077448666095733,
|
| 9369 |
+
"num_tokens": 11518936.0,
|
| 9370 |
+
"step": 10400
|
| 9371 |
+
},
|
| 9372 |
+
{
|
| 9373 |
+
"epoch": 2.0975216602861173,
|
| 9374 |
+
"grad_norm": 12.125,
|
| 9375 |
+
"learning_rate": 6.017865538316878e-06,
|
| 9376 |
+
"loss": 0.8428,
|
| 9377 |
+
"mean_token_accuracy": 0.7939082264900208,
|
| 9378 |
+
"num_tokens": 11529031.0,
|
| 9379 |
+
"step": 10410
|
| 9380 |
+
},
|
| 9381 |
+
{
|
| 9382 |
+
"epoch": 2.0995365706226075,
|
| 9383 |
+
"grad_norm": 12.4375,
|
| 9384 |
+
"learning_rate": 6.004432802740279e-06,
|
| 9385 |
+
"loss": 0.8232,
|
| 9386 |
+
"mean_token_accuracy": 0.8003376543521881,
|
| 9387 |
+
"num_tokens": 11541092.0,
|
| 9388 |
+
"step": 10420
|
| 9389 |
+
},
|
| 9390 |
+
{
|
| 9391 |
+
"epoch": 2.101551480959097,
|
| 9392 |
+
"grad_norm": 13.1875,
|
| 9393 |
+
"learning_rate": 5.991000067163678e-06,
|
| 9394 |
+
"loss": 0.7565,
|
| 9395 |
+
"mean_token_accuracy": 0.8153795897960663,
|
| 9396 |
+
"num_tokens": 11550995.0,
|
| 9397 |
+
"step": 10430
|
| 9398 |
+
},
|
| 9399 |
+
{
|
| 9400 |
+
"epoch": 2.1035663912955873,
|
| 9401 |
+
"grad_norm": 11.1875,
|
| 9402 |
+
"learning_rate": 5.977567331587079e-06,
|
| 9403 |
+
"loss": 0.7814,
|
| 9404 |
+
"mean_token_accuracy": 0.8018035531044007,
|
| 9405 |
+
"num_tokens": 11560893.0,
|
| 9406 |
+
"step": 10440
|
| 9407 |
+
},
|
| 9408 |
+
{
|
| 9409 |
+
"epoch": 2.1055813016320775,
|
| 9410 |
+
"grad_norm": 13.125,
|
| 9411 |
+
"learning_rate": 5.964134596010478e-06,
|
| 9412 |
+
"loss": 0.8012,
|
| 9413 |
+
"mean_token_accuracy": 0.7951594650745392,
|
| 9414 |
+
"num_tokens": 11571484.0,
|
| 9415 |
+
"step": 10450
|
| 9416 |
+
},
|
| 9417 |
+
{
|
| 9418 |
+
"epoch": 2.1075962119685676,
|
| 9419 |
+
"grad_norm": 11.5,
|
| 9420 |
+
"learning_rate": 5.950701860433877e-06,
|
| 9421 |
+
"loss": 0.8872,
|
| 9422 |
+
"mean_token_accuracy": 0.7848295509815216,
|
| 9423 |
+
"num_tokens": 11581689.0,
|
| 9424 |
+
"step": 10460
|
| 9425 |
+
},
|
| 9426 |
+
{
|
| 9427 |
+
"epoch": 2.1096111223050573,
|
| 9428 |
+
"grad_norm": 11.4375,
|
| 9429 |
+
"learning_rate": 5.937269124857278e-06,
|
| 9430 |
+
"loss": 0.7238,
|
| 9431 |
+
"mean_token_accuracy": 0.8154263854026794,
|
| 9432 |
+
"num_tokens": 11591503.0,
|
| 9433 |
+
"step": 10470
|
| 9434 |
+
},
|
| 9435 |
+
{
|
| 9436 |
+
"epoch": 2.1116260326415475,
|
| 9437 |
+
"grad_norm": 11.25,
|
| 9438 |
+
"learning_rate": 5.923836389280677e-06,
|
| 9439 |
+
"loss": 0.8675,
|
| 9440 |
+
"mean_token_accuracy": 0.7881495654582977,
|
| 9441 |
+
"num_tokens": 11602668.0,
|
| 9442 |
+
"step": 10480
|
| 9443 |
+
},
|
| 9444 |
+
{
|
| 9445 |
+
"epoch": 2.1136409429780376,
|
| 9446 |
+
"grad_norm": 14.25,
|
| 9447 |
+
"learning_rate": 5.910403653704077e-06,
|
| 9448 |
+
"loss": 0.7428,
|
| 9449 |
+
"mean_token_accuracy": 0.8135782480239868,
|
| 9450 |
+
"num_tokens": 11613507.0,
|
| 9451 |
+
"step": 10490
|
| 9452 |
+
},
|
| 9453 |
+
{
|
| 9454 |
+
"epoch": 2.1156558533145273,
|
| 9455 |
+
"grad_norm": 10.5,
|
| 9456 |
+
"learning_rate": 5.896970918127477e-06,
|
| 9457 |
+
"loss": 0.7815,
|
| 9458 |
+
"mean_token_accuracy": 0.807522964477539,
|
| 9459 |
+
"num_tokens": 11623915.0,
|
| 9460 |
+
"step": 10500
|
| 9461 |
}
|
| 9462 |
],
|
| 9463 |
"logging_steps": 10,
|
|
|
|
| 9477 |
"attributes": {}
|
| 9478 |
}
|
| 9479 |
},
|
| 9480 |
+
"total_flos": 1.4062792370479104e+16,
|
| 9481 |
"train_batch_size": 8,
|
| 9482 |
"trial_name": null,
|
| 9483 |
"trial_params": null
|