Training in progress, step 9500, checkpoint
last-checkpoint/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:070b5b7acfb870eafcd0bf40ce133115da39bd3236dee84b8493ea73e863aebf
 size 2384234968
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:b6986d123949d70dafc8db16862d29980777537d8be6a72c449522a071032d5c
 size 4768663315
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:40beb3dc5129ab4ac6babe96012ebdd87569ab488ea6742096d9d349a8d4cd73
 size 1465
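Note that model.safetensors, optimizer.pt, and scheduler.pt are tracked with Git LFS, so the diffs above touch only the three-line pointer files (version, oid, size); the binary payloads live in LFS storage, and the sha256 oid is what actually changed at this checkpoint. As a minimal sketch of the pointer format, the fields of an un-smudged pointer file could be read like this (the parsing helper is illustrative, not part of this repository):

# Minimal sketch: parse a Git LFS pointer file into its fields.
# Assumes `path` points at an un-smudged pointer (the three-line
# "version / oid / size" format shown above), not the resolved binary.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            key, _, value = line.strip().partition(" ")
            fields[key] = value  # e.g. "oid" -> "sha256:070b5b7a..."
    return fields

pointer = parse_lfs_pointer("last-checkpoint/model.safetensors")
print(pointer["oid"], pointer["size"])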
last-checkpoint/trainer_state.json
CHANGED
@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 2.
+  "epoch": 2.6904804049987607,
   "eval_steps": 100,
-  "global_step":
+  "global_step": 9500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -8108,6 +8108,456 @@
       "mean_token_accuracy": 0.7797822907567025,
       "num_tokens": 73725952.0,
       "step": 9000
+    },
+    {
+      "epoch": 2.5517046057988457,
+      "grad_norm": 1.1519678831100464,
+      "learning_rate": 1.661596559320256e-06,
+      "loss": 0.1289,
+      "mean_token_accuracy": 0.764640410989523,
+      "num_tokens": 73807872.0,
+      "step": 9010
+    },
+    {
+      "epoch": 2.554536764966191,
+      "grad_norm": 1.1929394006729126,
+      "learning_rate": 1.65110668205182e-06,
+      "loss": 0.1074,
+      "mean_token_accuracy": 0.7785836592316627,
+      "num_tokens": 73889792.0,
+      "step": 9020
+    },
+    {
+      "epoch": 2.5573689241335362,
+      "grad_norm": 1.3088452816009521,
+      "learning_rate": 1.6406168047833843e-06,
+      "loss": 0.0986,
+      "mean_token_accuracy": 0.7910469707101584,
+      "num_tokens": 73971712.0,
+      "step": 9030
+    },
+    {
+      "epoch": 2.5602010833008815,
+      "grad_norm": 1.795518159866333,
+      "learning_rate": 1.6301269275149482e-06,
+      "loss": 0.1081,
+      "mean_token_accuracy": 0.7642245594412088,
+      "num_tokens": 74053632.0,
+      "step": 9040
+    },
+    {
+      "epoch": 2.5630332424682267,
+      "grad_norm": 1.3358420133590698,
+      "learning_rate": 1.6196370502465123e-06,
+      "loss": 0.1147,
+      "mean_token_accuracy": 0.7747553832828998,
+      "num_tokens": 74135552.0,
+      "step": 9050
+    },
+    {
+      "epoch": 2.565865401635572,
+      "grad_norm": 1.489589810371399,
+      "learning_rate": 1.6091471729780763e-06,
+      "loss": 0.1207,
+      "mean_token_accuracy": 0.7772504851222038,
+      "num_tokens": 74217472.0,
+      "step": 9060
+    },
+    {
+      "epoch": 2.5686975608029172,
+      "grad_norm": 1.3394817113876343,
+      "learning_rate": 1.5986572957096402e-06,
+      "loss": 0.1376,
+      "mean_token_accuracy": 0.7540484316647053,
+      "num_tokens": 74299392.0,
+      "step": 9070
+    },
+    {
+      "epoch": 2.5715297199702625,
+      "grad_norm": 1.119963526725769,
+      "learning_rate": 1.5881674184412043e-06,
+      "loss": 0.1185,
+      "mean_token_accuracy": 0.7621819950640202,
+      "num_tokens": 74381312.0,
+      "step": 9080
+    },
+    {
+      "epoch": 2.5743618791376077,
+      "grad_norm": 1.4001566171646118,
+      "learning_rate": 1.5776775411727686e-06,
+      "loss": 0.104,
+      "mean_token_accuracy": 0.7811521515250206,
+      "num_tokens": 74463232.0,
+      "step": 9090
+    },
+    {
+      "epoch": 2.5771940383049525,
+      "grad_norm": 1.5772784948349,
+      "learning_rate": 1.5671876639043324e-06,
+      "loss": 0.1152,
+      "mean_token_accuracy": 0.7650195706635714,
+      "num_tokens": 74545152.0,
+      "step": 9100
+    },
+    {
+      "epoch": 2.5800261974722978,
+      "grad_norm": 1.7766703367233276,
+      "learning_rate": 1.5566977866358965e-06,
+      "loss": 0.113,
+      "mean_token_accuracy": 0.7801981404423713,
+      "num_tokens": 74627072.0,
+      "step": 9110
+    },
+    {
+      "epoch": 2.582858356639643,
+      "grad_norm": 1.4249588251113892,
+      "learning_rate": 1.5462079093674606e-06,
+      "loss": 0.1236,
+      "mean_token_accuracy": 0.7641022481024266,
+      "num_tokens": 74708992.0,
+      "step": 9120
+    },
+    {
+      "epoch": 2.5856905158069883,
+      "grad_norm": 1.6609476804733276,
+      "learning_rate": 1.5357180320990244e-06,
+      "loss": 0.1154,
+      "mean_token_accuracy": 0.763050389662385,
+      "num_tokens": 74790912.0,
+      "step": 9130
+    },
+    {
+      "epoch": 2.5885226749743335,
+      "grad_norm": 0.9137653708457947,
+      "learning_rate": 1.5252281548305885e-06,
+      "loss": 0.1179,
+      "mean_token_accuracy": 0.7632950112223625,
+      "num_tokens": 74872832.0,
+      "step": 9140
+    },
+    {
+      "epoch": 2.5913548341416788,
+      "grad_norm": 0.9380526542663574,
+      "learning_rate": 1.5147382775621528e-06,
+      "loss": 0.1242,
+      "mean_token_accuracy": 0.7689334619790316,
+      "num_tokens": 74954752.0,
+      "step": 9150
+    },
+    {
+      "epoch": 2.594186993309024,
+      "grad_norm": 1.246500849723816,
+      "learning_rate": 1.5042484002937167e-06,
+      "loss": 0.1106,
+      "mean_token_accuracy": 0.758524950966239,
+      "num_tokens": 75036672.0,
+      "step": 9160
+    },
+    {
+      "epoch": 2.5970191524763693,
+      "grad_norm": 1.2258425951004028,
+      "learning_rate": 1.4937585230252807e-06,
+      "loss": 0.121,
+      "mean_token_accuracy": 0.7841976564377546,
+      "num_tokens": 75118592.0,
+      "step": 9170
+    },
+    {
+      "epoch": 2.5998513116437145,
+      "grad_norm": 1.4543510675430298,
+      "learning_rate": 1.4832686457568448e-06,
+      "loss": 0.0928,
+      "mean_token_accuracy": 0.794337086752057,
+      "num_tokens": 75200512.0,
+      "step": 9180
+    },
+    {
+      "epoch": 2.6026834708110593,
+      "grad_norm": 1.4098447561264038,
+      "learning_rate": 1.4727787684884087e-06,
+      "loss": 0.1181,
+      "mean_token_accuracy": 0.7629280813038349,
+      "num_tokens": 75282432.0,
+      "step": 9190
+    },
+    {
+      "epoch": 2.605515629978405,
+      "grad_norm": 1.3578165769577026,
+      "learning_rate": 1.4622888912199728e-06,
+      "loss": 0.1072,
+      "mean_token_accuracy": 0.7831213314086198,
+      "num_tokens": 75364352.0,
+      "step": 9200
+    },
+    {
+      "epoch": 2.60834778914575,
+      "grad_norm": 1.7388701438903809,
+      "learning_rate": 1.451799013951537e-06,
+      "loss": 0.1029,
+      "mean_token_accuracy": 0.7875366933643818,
+      "num_tokens": 75446272.0,
+      "step": 9210
+    },
+    {
+      "epoch": 2.611179948313095,
+      "grad_norm": 1.3704735040664673,
+      "learning_rate": 1.441309136683101e-06,
+      "loss": 0.1546,
+      "mean_token_accuracy": 0.7491927597671747,
+      "num_tokens": 75528192.0,
+      "step": 9220
+    },
+    {
+      "epoch": 2.6140121074804403,
+      "grad_norm": 1.2139005661010742,
+      "learning_rate": 1.430819259414665e-06,
+      "loss": 0.1298,
+      "mean_token_accuracy": 0.7586350254714489,
+      "num_tokens": 75610112.0,
+      "step": 9230
+    },
+    {
+      "epoch": 2.6168442666477856,
+      "grad_norm": 2.0187840461730957,
+      "learning_rate": 1.420329382146229e-06,
+      "loss": 0.1319,
+      "mean_token_accuracy": 0.7546477496623993,
+      "num_tokens": 75692032.0,
+      "step": 9240
+    },
+    {
+      "epoch": 2.619676425815131,
+      "grad_norm": 1.0713800191879272,
+      "learning_rate": 1.409839504877793e-06,
+      "loss": 0.0989,
+      "mean_token_accuracy": 0.7925269067287445,
+      "num_tokens": 75773952.0,
+      "step": 9250
+    },
+    {
+      "epoch": 2.622508584982476,
+      "grad_norm": 1.284598469734192,
+      "learning_rate": 1.3993496276093572e-06,
+      "loss": 0.1198,
+      "mean_token_accuracy": 0.7871208406984807,
+      "num_tokens": 75855872.0,
+      "step": 9260
+    },
+    {
+      "epoch": 2.6253407441498213,
+      "grad_norm": 0.9530990123748779,
+      "learning_rate": 1.3888597503409213e-06,
+      "loss": 0.1194,
+      "mean_token_accuracy": 0.7739114474505187,
+      "num_tokens": 75937792.0,
+      "step": 9270
+    },
+    {
+      "epoch": 2.6281729033171666,
+      "grad_norm": 1.252050757408142,
+      "learning_rate": 1.3783698730724852e-06,
+      "loss": 0.1143,
+      "mean_token_accuracy": 0.7736790612339973,
+      "num_tokens": 76019712.0,
+      "step": 9280
+    },
+    {
+      "epoch": 2.631005062484512,
+      "grad_norm": 1.2160993814468384,
+      "learning_rate": 1.3678799958040492e-06,
+      "loss": 0.1035,
+      "mean_token_accuracy": 0.7817025430500507,
+      "num_tokens": 76101632.0,
+      "step": 9290
+    },
+    {
+      "epoch": 2.6338372216518566,
+      "grad_norm": 1.4404122829437256,
+      "learning_rate": 1.3573901185356133e-06,
+      "loss": 0.1285,
+      "mean_token_accuracy": 0.7545743621885777,
+      "num_tokens": 76183552.0,
+      "step": 9300
+    },
+    {
+      "epoch": 2.636669380819202,
+      "grad_norm": 1.158105492591858,
+      "learning_rate": 1.3469002412671772e-06,
+      "loss": 0.1127,
+      "mean_token_accuracy": 0.7763209372758866,
+      "num_tokens": 76265472.0,
+      "step": 9310
+    },
+    {
+      "epoch": 2.639501539986547,
+      "grad_norm": 1.2974953651428223,
+      "learning_rate": 1.3364103639987415e-06,
+      "loss": 0.1254,
+      "mean_token_accuracy": 0.7779109582304955,
+      "num_tokens": 76347392.0,
+      "step": 9320
+    },
+    {
+      "epoch": 2.6423336991538924,
+      "grad_norm": 1.4528638124465942,
+      "learning_rate": 1.3259204867303055e-06,
+      "loss": 0.1089,
+      "mean_token_accuracy": 0.7840998075902462,
+      "num_tokens": 76429312.0,
+      "step": 9330
+    },
+    {
+      "epoch": 2.6451658583212376,
+      "grad_norm": 0.9896726012229919,
+      "learning_rate": 1.3154306094618694e-06,
+      "loss": 0.103,
+      "mean_token_accuracy": 0.7768224064260721,
+      "num_tokens": 76511232.0,
+      "step": 9340
+    },
+    {
+      "epoch": 2.647998017488583,
+      "grad_norm": 1.1756311655044556,
+      "learning_rate": 1.3049407321934335e-06,
+      "loss": 0.1033,
+      "mean_token_accuracy": 0.7787426613271237,
+      "num_tokens": 76593152.0,
+      "step": 9350
+    },
+    {
+      "epoch": 2.650830176655928,
+      "grad_norm": 1.280672550201416,
+      "learning_rate": 1.2944508549249975e-06,
+      "loss": 0.1049,
+      "mean_token_accuracy": 0.7903131127357483,
+      "num_tokens": 76675072.0,
+      "step": 9360
+    },
+    {
+      "epoch": 2.6536623358232734,
+      "grad_norm": 1.228232979774475,
+      "learning_rate": 1.2839609776565614e-06,
+      "loss": 0.1449,
+      "mean_token_accuracy": 0.743456457555294,
+      "num_tokens": 76756992.0,
+      "step": 9370
+    },
+    {
+      "epoch": 2.6564944949906186,
+      "grad_norm": 1.4639639854431152,
+      "learning_rate": 1.2734711003881257e-06,
+      "loss": 0.1358,
+      "mean_token_accuracy": 0.7659980464726687,
+      "num_tokens": 76838912.0,
+      "step": 9380
+    },
+    {
+      "epoch": 2.6593266541579634,
+      "grad_norm": 1.4914389848709106,
+      "learning_rate": 1.2629812231196898e-06,
+      "loss": 0.1121,
+      "mean_token_accuracy": 0.7777764193713665,
+      "num_tokens": 76920832.0,
+      "step": 9390
+    },
+    {
+      "epoch": 2.662158813325309,
+      "grad_norm": 1.1283109188079834,
+      "learning_rate": 1.2524913458512536e-06,
+      "loss": 0.113,
+      "mean_token_accuracy": 0.7669520601630211,
+      "num_tokens": 77002752.0,
+      "step": 9400
+    },
+    {
+      "epoch": 2.664990972492654,
+      "grad_norm": 1.1668506860733032,
+      "learning_rate": 1.2420014685828177e-06,
+      "loss": 0.1224,
+      "mean_token_accuracy": 0.7749633088707923,
+      "num_tokens": 77084672.0,
+      "step": 9410
+    },
+    {
+      "epoch": 2.667823131659999,
+      "grad_norm": 1.8604751825332642,
+      "learning_rate": 1.2315115913143818e-06,
+      "loss": 0.1349,
+      "mean_token_accuracy": 0.7719178080558777,
+      "num_tokens": 77166592.0,
+      "step": 9420
+    },
+    {
+      "epoch": 2.6706552908273444,
+      "grad_norm": 2.2527692317962646,
+      "learning_rate": 1.2210217140459456e-06,
+      "loss": 0.1357,
+      "mean_token_accuracy": 0.7704500976949931,
+      "num_tokens": 77248512.0,
+      "step": 9430
+    },
+    {
+      "epoch": 2.6734874499946897,
+      "grad_norm": 1.1649688482284546,
+      "learning_rate": 1.21053183677751e-06,
+      "loss": 0.1056,
+      "mean_token_accuracy": 0.7830601751804351,
+      "num_tokens": 77330432.0,
+      "step": 9440
+    },
+    {
+      "epoch": 2.676319609162035,
+      "grad_norm": 1.1416834592819214,
+      "learning_rate": 1.2000419595090738e-06,
+      "loss": 0.1265,
+      "mean_token_accuracy": 0.7677592922002077,
+      "num_tokens": 77412352.0,
+      "step": 9450
+    },
+    {
+      "epoch": 2.67915176832938,
+      "grad_norm": 1.1690260171890259,
+      "learning_rate": 1.1895520822406379e-06,
+      "loss": 0.1164,
+      "mean_token_accuracy": 0.7738502897322178,
+      "num_tokens": 77494272.0,
+      "step": 9460
+    },
+    {
+      "epoch": 2.6819839274967254,
+      "grad_norm": 1.4305615425109863,
+      "learning_rate": 1.179062204972202e-06,
+      "loss": 0.1248,
+      "mean_token_accuracy": 0.7664261247962714,
+      "num_tokens": 77576192.0,
+      "step": 9470
+    },
+    {
+      "epoch": 2.6848160866640707,
+      "grad_norm": 1.3226728439331055,
+      "learning_rate": 1.168572327703766e-06,
+      "loss": 0.1253,
+      "mean_token_accuracy": 0.7715998016297817,
+      "num_tokens": 77658112.0,
+      "step": 9480
+    },
+    {
+      "epoch": 2.687648245831416,
+      "grad_norm": 1.2239925861358643,
+      "learning_rate": 1.1580824504353299e-06,
+      "loss": 0.1412,
+      "mean_token_accuracy": 0.7618395283818244,
+      "num_tokens": 77740032.0,
+      "step": 9490
+    },
+    {
+      "epoch": 2.6904804049987607,
+      "grad_norm": 1.3090022802352905,
+      "learning_rate": 1.1475925731668942e-06,
+      "loss": 0.1944,
+      "mean_token_accuracy": 0.7248899202793837,
+      "num_tokens": 77821952.0,
+      "step": 9500
     }
   ],
   "logging_steps": 10,
@@ -8127,7 +8577,7 @@
       "attributes": {}
     }
   },
-  "total_flos":
+  "total_flos": 2.0566822331036467e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null
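The 50 appended log_history entries (steps 9010 through 9500, one every 10 steps per "logging_steps": 10) can be summarized straight from the checkpoint. A minimal sketch, assuming a local copy of last-checkpoint/trainer_state.json; the 9000 threshold is just the previous checkpoint's step:

import json

# Minimal sketch: print the training-log entries added by this commit.
with open("last-checkpoint/trainer_state.json", encoding="utf-8") as fh:
    state = json.load(fh)

for e in state["log_history"]:
    if e.get("step", 0) > 9000 and "loss" in e:
        print(f'step {e["step"]:>5}  loss {e["loss"]:.4f}  '
              f'lr {e["learning_rate"]:.3e}  acc {e["mean_token_accuracy"]:.4f}')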