Training in progress, step 11838, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 328277848
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6b44b5dec3eb521e4966741844bbf6502227d6ed08a1303474080332dfbe45e5
|
| 3 |
size 328277848
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 318646859
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6886cac9d99b59f70ddb2a5a11358ca7a3e8d9f6f65ffabbc7d41be6c68dc0f9
|
| 3 |
size 318646859
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4b172ef1a2b23540cb3d53eed9b6dcd9ee9e06553bb8c4f5a46142cb0fe60689
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -8249,6 +8249,237 @@
|
|
| 8249 |
"eval_samples_per_second": 273.376,
|
| 8250 |
"eval_steps_per_second": 5.741,
|
| 8251 |
"step": 11500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8252 |
}
|
| 8253 |
],
|
| 8254 |
"logging_steps": 10,
|
|
@@ -8263,12 +8494,12 @@
|
|
| 8263 |
"should_evaluate": false,
|
| 8264 |
"should_log": false,
|
| 8265 |
"should_save": true,
|
| 8266 |
-
"should_training_stop":
|
| 8267 |
},
|
| 8268 |
"attributes": {}
|
| 8269 |
}
|
| 8270 |
},
|
| 8271 |
-
"total_flos": 3.
|
| 8272 |
"train_batch_size": 48,
|
| 8273 |
"trial_name": null,
|
| 8274 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.0,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 11838,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 8249 |
"eval_samples_per_second": 273.376,
|
| 8250 |
"eval_steps_per_second": 5.741,
|
| 8251 |
"step": 11500
|
| 8252 |
+
},
|
| 8253 |
+
{
|
| 8254 |
+
"epoch": 1.9445852339922283,
|
| 8255 |
+
"grad_norm": 0.4365793466567993,
|
| 8256 |
+
"learning_rate": 8.270650415620584e-07,
|
| 8257 |
+
"loss": 4.29614372253418,
|
| 8258 |
+
"step": 11510
|
| 8259 |
+
},
|
| 8260 |
+
{
|
| 8261 |
+
"epoch": 1.9462747085656362,
|
| 8262 |
+
"grad_norm": 0.43951302766799927,
|
| 8263 |
+
"learning_rate": 7.775944524542055e-07,
|
| 8264 |
+
"loss": 4.270536422729492,
|
| 8265 |
+
"step": 11520
|
| 8266 |
+
},
|
| 8267 |
+
{
|
| 8268 |
+
"epoch": 1.9479641831390437,
|
| 8269 |
+
"grad_norm": 0.45033299922943115,
|
| 8270 |
+
"learning_rate": 7.296455308872406e-07,
|
| 8271 |
+
"loss": 4.282721710205078,
|
| 8272 |
+
"step": 11530
|
| 8273 |
+
},
|
| 8274 |
+
{
|
| 8275 |
+
"epoch": 1.9496536577124515,
|
| 8276 |
+
"grad_norm": 0.43896329402923584,
|
| 8277 |
+
"learning_rate": 6.832187658113441e-07,
|
| 8278 |
+
"loss": 4.2960052490234375,
|
| 8279 |
+
"step": 11540
|
| 8280 |
+
},
|
| 8281 |
+
{
|
| 8282 |
+
"epoch": 1.9513431322858592,
|
| 8283 |
+
"grad_norm": 0.44476914405822754,
|
| 8284 |
+
"learning_rate": 6.383146306547626e-07,
|
| 8285 |
+
"loss": 4.30328483581543,
|
| 8286 |
+
"step": 11550
|
| 8287 |
+
},
|
| 8288 |
+
{
|
| 8289 |
+
"epoch": 1.9530326068592667,
|
| 8290 |
+
"grad_norm": 0.4521080553531647,
|
| 8291 |
+
"learning_rate": 5.949335833189628e-07,
|
| 8292 |
+
"loss": 4.327355575561524,
|
| 8293 |
+
"step": 11560
|
| 8294 |
+
},
|
| 8295 |
+
{
|
| 8296 |
+
"epoch": 1.9547220814326745,
|
| 8297 |
+
"grad_norm": 0.4513733983039856,
|
| 8298 |
+
"learning_rate": 5.530760661741018e-07,
|
| 8299 |
+
"loss": 4.302740859985351,
|
| 8300 |
+
"step": 11570
|
| 8301 |
+
},
|
| 8302 |
+
{
|
| 8303 |
+
"epoch": 1.9564115560060822,
|
| 8304 |
+
"grad_norm": 0.4388287663459778,
|
| 8305 |
+
"learning_rate": 5.127425060543478e-07,
|
| 8306 |
+
"loss": 4.277825546264649,
|
| 8307 |
+
"step": 11580
|
| 8308 |
+
},
|
| 8309 |
+
{
|
| 8310 |
+
"epoch": 1.9581010305794897,
|
| 8311 |
+
"grad_norm": 0.44225117564201355,
|
| 8312 |
+
"learning_rate": 4.7393331425364943e-07,
|
| 8313 |
+
"loss": 4.280667877197265,
|
| 8314 |
+
"step": 11590
|
| 8315 |
+
},
|
| 8316 |
+
{
|
| 8317 |
+
"epoch": 1.9597905051528974,
|
| 8318 |
+
"grad_norm": 0.44035524129867554,
|
| 8319 |
+
"learning_rate": 4.3664888652144017e-07,
|
| 8320 |
+
"loss": 4.278044891357422,
|
| 8321 |
+
"step": 11600
|
| 8322 |
+
},
|
| 8323 |
+
{
|
| 8324 |
+
"epoch": 1.9614799797263052,
|
| 8325 |
+
"grad_norm": 0.44098883867263794,
|
| 8326 |
+
"learning_rate": 4.008896030587072e-07,
|
| 8327 |
+
"loss": 4.268376159667969,
|
| 8328 |
+
"step": 11610
|
| 8329 |
+
},
|
| 8330 |
+
{
|
| 8331 |
+
"epoch": 1.9631694542997127,
|
| 8332 |
+
"grad_norm": 0.43533217906951904,
|
| 8333 |
+
"learning_rate": 3.6665582851406195e-07,
|
| 8334 |
+
"loss": 4.295290756225586,
|
| 8335 |
+
"step": 11620
|
| 8336 |
+
},
|
| 8337 |
+
{
|
| 8338 |
+
"epoch": 1.9648589288731204,
|
| 8339 |
+
"grad_norm": 0.45106539130210876,
|
| 8340 |
+
"learning_rate": 3.3394791198000927e-07,
|
| 8341 |
+
"loss": 4.281253051757813,
|
| 8342 |
+
"step": 11630
|
| 8343 |
+
},
|
| 8344 |
+
{
|
| 8345 |
+
"epoch": 1.9665484034465281,
|
| 8346 |
+
"grad_norm": 0.44754281640052795,
|
| 8347 |
+
"learning_rate": 3.027661869893672e-07,
|
| 8348 |
+
"loss": 4.281909942626953,
|
| 8349 |
+
"step": 11640
|
| 8350 |
+
},
|
| 8351 |
+
{
|
| 8352 |
+
"epoch": 1.9682378780199357,
|
| 8353 |
+
"grad_norm": 0.438475638628006,
|
| 8354 |
+
"learning_rate": 2.731109715119861e-07,
|
| 8355 |
+
"loss": 4.280799484252929,
|
| 8356 |
+
"step": 11650
|
| 8357 |
+
},
|
| 8358 |
+
{
|
| 8359 |
+
"epoch": 1.9699273525933436,
|
| 8360 |
+
"grad_norm": 0.44646841287612915,
|
| 8361 |
+
"learning_rate": 2.4498256795135173e-07,
|
| 8362 |
+
"loss": 4.306585693359375,
|
| 8363 |
+
"step": 11660
|
| 8364 |
+
},
|
| 8365 |
+
{
|
| 8366 |
+
"epoch": 1.9716168271667511,
|
| 8367 |
+
"grad_norm": 0.4341582953929901,
|
| 8368 |
+
"learning_rate": 2.183812631415871e-07,
|
| 8369 |
+
"loss": 4.274542617797851,
|
| 8370 |
+
"step": 11670
|
| 8371 |
+
},
|
| 8372 |
+
{
|
| 8373 |
+
"epoch": 1.9733063017401589,
|
| 8374 |
+
"grad_norm": 0.4331877827644348,
|
| 8375 |
+
"learning_rate": 1.933073283445219e-07,
|
| 8376 |
+
"loss": 4.2908935546875,
|
| 8377 |
+
"step": 11680
|
| 8378 |
+
},
|
| 8379 |
+
{
|
| 8380 |
+
"epoch": 1.9749957763135666,
|
| 8381 |
+
"grad_norm": 0.447518914937973,
|
| 8382 |
+
"learning_rate": 1.697610192469112e-07,
|
| 8383 |
+
"loss": 4.3111217498779295,
|
| 8384 |
+
"step": 11690
|
| 8385 |
+
},
|
| 8386 |
+
{
|
| 8387 |
+
"epoch": 1.976685250886974,
|
| 8388 |
+
"grad_norm": 0.44273945689201355,
|
| 8389 |
+
"learning_rate": 1.4774257595783766e-07,
|
| 8390 |
+
"loss": 4.300546264648437,
|
| 8391 |
+
"step": 11700
|
| 8392 |
+
},
|
| 8393 |
+
{
|
| 8394 |
+
"epoch": 1.9783747254603818,
|
| 8395 |
+
"grad_norm": 0.45125117897987366,
|
| 8396 |
+
"learning_rate": 1.272522230062467e-07,
|
| 8397 |
+
"loss": 4.289936828613281,
|
| 8398 |
+
"step": 11710
|
| 8399 |
+
},
|
| 8400 |
+
{
|
| 8401 |
+
"epoch": 1.9800642000337896,
|
| 8402 |
+
"grad_norm": 0.43694615364074707,
|
| 8403 |
+
"learning_rate": 1.0829016933869838e-07,
|
| 8404 |
+
"loss": 4.289299392700196,
|
| 8405 |
+
"step": 11720
|
| 8406 |
+
},
|
| 8407 |
+
{
|
| 8408 |
+
"epoch": 1.981753674607197,
|
| 8409 |
+
"grad_norm": 0.44341588020324707,
|
| 8410 |
+
"learning_rate": 9.085660831715247e-08,
|
| 8411 |
+
"loss": 4.297845458984375,
|
| 8412 |
+
"step": 11730
|
| 8413 |
+
},
|
| 8414 |
+
{
|
| 8415 |
+
"epoch": 1.9834431491806048,
|
| 8416 |
+
"grad_norm": 0.446321576833725,
|
| 8417 |
+
"learning_rate": 7.495171771710328e-08,
|
| 8418 |
+
"loss": 4.293007659912109,
|
| 8419 |
+
"step": 11740
|
| 8420 |
+
},
|
| 8421 |
+
{
|
| 8422 |
+
"epoch": 1.9851326237540126,
|
| 8423 |
+
"grad_norm": 0.44870322942733765,
|
| 8424 |
+
"learning_rate": 6.057565972568123e-08,
|
| 8425 |
+
"loss": 4.291889190673828,
|
| 8426 |
+
"step": 11750
|
| 8427 |
+
},
|
| 8428 |
+
{
|
| 8429 |
+
"epoch": 1.98682209832742,
|
| 8430 |
+
"grad_norm": 0.44810283184051514,
|
| 8431 |
+
"learning_rate": 4.772858094005405e-08,
|
| 8432 |
+
"loss": 4.299283981323242,
|
| 8433 |
+
"step": 11760
|
| 8434 |
+
},
|
| 8435 |
+
{
|
| 8436 |
+
"epoch": 1.9885115729008278,
|
| 8437 |
+
"grad_norm": 0.44559845328330994,
|
| 8438 |
+
"learning_rate": 3.641061236591136e-08,
|
| 8439 |
+
"loss": 4.282249832153321,
|
| 8440 |
+
"step": 11770
|
| 8441 |
+
},
|
| 8442 |
+
{
|
| 8443 |
+
"epoch": 1.9902010474742355,
|
| 8444 |
+
"grad_norm": 0.4399174153804779,
|
| 8445 |
+
"learning_rate": 2.6621869416099118e-08,
|
| 8446 |
+
"loss": 4.288850021362305,
|
| 8447 |
+
"step": 11780
|
| 8448 |
+
},
|
| 8449 |
+
{
|
| 8450 |
+
"epoch": 1.991890522047643,
|
| 8451 |
+
"grad_norm": 0.4408097267150879,
|
| 8452 |
+
"learning_rate": 1.8362451909520458e-08,
|
| 8453 |
+
"loss": 4.286129760742187,
|
| 8454 |
+
"step": 11790
|
| 8455 |
+
},
|
| 8456 |
+
{
|
| 8457 |
+
"epoch": 1.993579996621051,
|
| 8458 |
+
"grad_norm": 0.44703418016433716,
|
| 8459 |
+
"learning_rate": 1.16324440700033e-08,
|
| 8460 |
+
"loss": 4.295662307739258,
|
| 8461 |
+
"step": 11800
|
| 8462 |
+
},
|
| 8463 |
+
{
|
| 8464 |
+
"epoch": 1.9952694711944585,
|
| 8465 |
+
"grad_norm": 0.45136868953704834,
|
| 8466 |
+
"learning_rate": 6.431914525567572e-09,
|
| 8467 |
+
"loss": 4.288112640380859,
|
| 8468 |
+
"step": 11810
|
| 8469 |
+
},
|
| 8470 |
+
{
|
| 8471 |
+
"epoch": 1.996958945767866,
|
| 8472 |
+
"grad_norm": 0.4494999647140503,
|
| 8473 |
+
"learning_rate": 2.760916307625871e-09,
|
| 8474 |
+
"loss": 4.304160308837891,
|
| 8475 |
+
"step": 11820
|
| 8476 |
+
},
|
| 8477 |
+
{
|
| 8478 |
+
"epoch": 1.998648420341274,
|
| 8479 |
+
"grad_norm": 0.4357486665248871,
|
| 8480 |
+
"learning_rate": 6.194868504838524e-10,
|
| 8481 |
+
"loss": 4.277233123779297,
|
| 8482 |
+
"step": 11830
|
| 8483 |
}
|
| 8484 |
],
|
| 8485 |
"logging_steps": 10,
|
|
|
|
| 8494 |
"should_evaluate": false,
|
| 8495 |
"should_log": false,
|
| 8496 |
"should_save": true,
|
| 8497 |
+
"should_training_stop": true
|
| 8498 |
},
|
| 8499 |
"attributes": {}
|
| 8500 |
}
|
| 8501 |
},
|
| 8502 |
+
"total_flos": 3.959258038224814e+17,
|
| 8503 |
"train_batch_size": 48,
|
| 8504 |
"trial_name": null,
|
| 8505 |
"trial_params": null
|