Training in progress, step 4965, checkpoint
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:a2002857724b9ba9c06e91b94244022cd822af76f616e23253e73b0d37445df8
 size 791869518
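The new weights land in a single 791,869,518-byte `model.safetensors` shard. As a minimal sketch (local path is hypothetical and assumes the checkpoint directory has already been downloaded), the safetensors header can be inspected without reading any tensor data:

```python
from safetensors import safe_open

# Hypothetical local path; assumes last-checkpoint/ was fetched first.
with safe_open("last-checkpoint/model.safetensors", framework="pt") as f:
    for name in f.keys():
        s = f.get_slice(name)  # lazy view: only the JSON header is parsed
        print(name, s.get_shape(), s.get_dtype())
```

Because safetensors stores names, shapes, and offsets in a leading JSON header, this listing is cheap even on a ~792 MB file.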
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:c88886b857669b8c1bda6f46dfba65789f419ec3c8ba7f50d3594ac44f9c8501
 size 2375752250
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:02b7e464b14d20e04557f2705a171f16ff23e7e0780b1d8336dd791f6f104a57
 size 1000
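All three binaries above are tracked through Git LFS, so the repository only stores a three-line pointer file per blob: the spec version, a `sha256` object ID, and the byte size. A minimal verification sketch (helper names are illustrative; assumes the actual blobs were fetched, e.g. with `git lfs pull`) that checks a downloaded file against its pointer:

```python
import hashlib
from pathlib import Path

def parse_lfs_pointer(pointer_path: str) -> dict:
    # Each pointer line is "<key> <value>", e.g. "oid sha256:<hex digest>".
    fields = {}
    for line in Path(pointer_path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

def verify_blob(pointer_path: str, blob_path: str) -> bool:
    fields = parse_lfs_pointer(pointer_path)
    expected_oid = fields["oid"].split(":", 1)[1]
    expected_size = int(fields["size"])
    blob = Path(blob_path)
    if blob.stat().st_size != expected_size:  # cheap size check first
        return False
    digest = hashlib.sha256()
    with blob.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # 1 MiB chunks
            digest.update(chunk)
    return digest.hexdigest() == expected_oid
```

Checking size before hashing avoids streaming the 2.4 GB `optimizer.pt` when the download is obviously incomplete.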
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.
+  "epoch": 0.9999055980364392,
   "eval_steps": 500,
-  "global_step":
+  "global_step": 4965,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -6379,6 +6379,657 @@
       "eval_samples_per_second": 1109.992,
       "eval_steps_per_second": 34.688,
       "step": 4500
+    },
+    {
+      "epoch": 0.9072658044620662,
+      "grad_norm": 104.1875,
+      "learning_rate": 1.0295434198746643e-07,
+      "loss": 96.1257,
+      "step": 4505
+    },
+    {
+      "epoch": 0.9082727587400484,
+      "grad_norm": 105.625,
+      "learning_rate": 1.0183527305282006e-07,
+      "loss": 96.9505,
+      "step": 4510
+    },
+    {
+      "epoch": 0.9092797130180308,
+      "grad_norm": 108.375,
+      "learning_rate": 1.0071620411817368e-07,
+      "loss": 96.6111,
+      "step": 4515
+    },
+    {
+      "epoch": 0.910286667296013,
+      "grad_norm": 106.4375,
+      "learning_rate": 9.95971351835273e-08,
+      "loss": 97.3165,
+      "step": 4520
+    },
+    {
+      "epoch": 0.9112936215739954,
+      "grad_norm": 105.375,
+      "learning_rate": 9.847806624888093e-08,
+      "loss": 97.2006,
+      "step": 4525
+    },
+    {
+      "epoch": 0.9123005758519778,
+      "grad_norm": 109.0,
+      "learning_rate": 9.735899731423455e-08,
+      "loss": 96.5357,
+      "step": 4530
+    },
+    {
+      "epoch": 0.91330753012996,
+      "grad_norm": 104.4375,
+      "learning_rate": 9.623992837958818e-08,
+      "loss": 96.6608,
+      "step": 4535
+    },
+    {
+      "epoch": 0.9143144844079424,
+      "grad_norm": 103.4375,
+      "learning_rate": 9.51208594449418e-08,
+      "loss": 96.2924,
+      "step": 4540
+    },
+    {
+      "epoch": 0.9153214386859246,
+      "grad_norm": 101.5,
+      "learning_rate": 9.400179051029543e-08,
+      "loss": 94.9458,
+      "step": 4545
+    },
+    {
+      "epoch": 0.916328392963907,
+      "grad_norm": 108.875,
+      "learning_rate": 9.288272157564905e-08,
+      "loss": 95.3736,
+      "step": 4550
+    },
+    {
+      "epoch": 0.9173353472418893,
+      "grad_norm": 106.0,
+      "learning_rate": 9.176365264100267e-08,
+      "loss": 94.3943,
+      "step": 4555
+    },
+    {
+      "epoch": 0.9183423015198716,
+      "grad_norm": 105.4375,
+      "learning_rate": 9.06445837063563e-08,
+      "loss": 97.27,
+      "step": 4560
+    },
+    {
+      "epoch": 0.919349255797854,
+      "grad_norm": 106.3125,
+      "learning_rate": 8.952551477170993e-08,
+      "loss": 95.4415,
+      "step": 4565
+    },
+    {
+      "epoch": 0.9203562100758362,
+      "grad_norm": 107.1875,
+      "learning_rate": 8.840644583706356e-08,
+      "loss": 96.8434,
+      "step": 4570
+    },
+    {
+      "epoch": 0.9213631643538186,
+      "grad_norm": 105.1875,
+      "learning_rate": 8.728737690241718e-08,
+      "loss": 96.2896,
+      "step": 4575
+    },
+    {
+      "epoch": 0.9223701186318009,
+      "grad_norm": 104.875,
+      "learning_rate": 8.616830796777082e-08,
+      "loss": 97.0949,
+      "step": 4580
+    },
+    {
+      "epoch": 0.9233770729097832,
+      "grad_norm": 107.375,
+      "learning_rate": 8.504923903312444e-08,
+      "loss": 96.0602,
+      "step": 4585
+    },
+    {
+      "epoch": 0.9243840271877655,
+      "grad_norm": 105.0,
+      "learning_rate": 8.393017009847807e-08,
+      "loss": 96.6697,
+      "step": 4590
+    },
+    {
+      "epoch": 0.9253909814657478,
+      "grad_norm": 103.5,
+      "learning_rate": 8.281110116383169e-08,
+      "loss": 95.5824,
+      "step": 4595
+    },
+    {
+      "epoch": 0.9263979357437301,
+      "grad_norm": 107.5625,
+      "learning_rate": 8.169203222918532e-08,
+      "loss": 96.6081,
+      "step": 4600
+    },
+    {
+      "epoch": 0.9274048900217124,
+      "grad_norm": 108.4375,
+      "learning_rate": 8.057296329453894e-08,
+      "loss": 96.3714,
+      "step": 4605
+    },
+    {
+      "epoch": 0.9284118442996948,
+      "grad_norm": 105.3125,
+      "learning_rate": 7.945389435989256e-08,
+      "loss": 95.8521,
+      "step": 4610
+    },
+    {
+      "epoch": 0.9294187985776771,
+      "grad_norm": 108.125,
+      "learning_rate": 7.833482542524619e-08,
+      "loss": 96.356,
+      "step": 4615
+    },
+    {
+      "epoch": 0.9304257528556594,
+      "grad_norm": 105.9375,
+      "learning_rate": 7.721575649059981e-08,
+      "loss": 96.4865,
+      "step": 4620
+    },
+    {
+      "epoch": 0.9314327071336417,
+      "grad_norm": 105.6875,
+      "learning_rate": 7.609668755595345e-08,
+      "loss": 95.1476,
+      "step": 4625
+    },
+    {
+      "epoch": 0.932439661411624,
+      "grad_norm": 106.9375,
+      "learning_rate": 7.497761862130707e-08,
+      "loss": 95.1061,
+      "step": 4630
+    },
+    {
+      "epoch": 0.9334466156896063,
+      "grad_norm": 105.5625,
+      "learning_rate": 7.38585496866607e-08,
+      "loss": 95.2852,
+      "step": 4635
+    },
+    {
+      "epoch": 0.9344535699675887,
+      "grad_norm": 107.5625,
+      "learning_rate": 7.273948075201432e-08,
+      "loss": 95.0002,
+      "step": 4640
+    },
+    {
+      "epoch": 0.935460524245571,
+      "grad_norm": 106.5625,
+      "learning_rate": 7.162041181736795e-08,
+      "loss": 97.3515,
+      "step": 4645
+    },
+    {
+      "epoch": 0.9364674785235533,
+      "grad_norm": 106.875,
+      "learning_rate": 7.050134288272157e-08,
+      "loss": 96.8893,
+      "step": 4650
+    },
+    {
+      "epoch": 0.9374744328015356,
+      "grad_norm": 106.125,
+      "learning_rate": 6.938227394807519e-08,
+      "loss": 96.1281,
+      "step": 4655
+    },
+    {
+      "epoch": 0.9384813870795179,
+      "grad_norm": 105.4375,
+      "learning_rate": 6.826320501342882e-08,
+      "loss": 95.932,
+      "step": 4660
+    },
+    {
+      "epoch": 0.9394883413575003,
+      "grad_norm": 106.0625,
+      "learning_rate": 6.714413607878245e-08,
+      "loss": 96.195,
+      "step": 4665
+    },
+    {
+      "epoch": 0.9404952956354825,
+      "grad_norm": 106.1875,
+      "learning_rate": 6.602506714413608e-08,
+      "loss": 94.7684,
+      "step": 4670
+    },
+    {
+      "epoch": 0.9415022499134649,
+      "grad_norm": 109.0,
+      "learning_rate": 6.49059982094897e-08,
+      "loss": 96.4495,
+      "step": 4675
+    },
+    {
+      "epoch": 0.9425092041914472,
+      "grad_norm": 109.0,
+      "learning_rate": 6.378692927484333e-08,
+      "loss": 96.9962,
+      "step": 4680
+    },
+    {
+      "epoch": 0.9435161584694295,
+      "grad_norm": 104.3125,
+      "learning_rate": 6.266786034019696e-08,
+      "loss": 94.3069,
+      "step": 4685
+    },
+    {
+      "epoch": 0.9445231127474119,
+      "grad_norm": 107.625,
+      "learning_rate": 6.154879140555059e-08,
+      "loss": 96.7521,
+      "step": 4690
+    },
+    {
+      "epoch": 0.9455300670253941,
+      "grad_norm": 104.3125,
+      "learning_rate": 6.042972247090421e-08,
+      "loss": 96.0066,
+      "step": 4695
+    },
+    {
+      "epoch": 0.9465370213033765,
+      "grad_norm": 104.875,
+      "learning_rate": 5.931065353625783e-08,
+      "loss": 94.7801,
+      "step": 4700
+    },
+    {
+      "epoch": 0.9475439755813587,
+      "grad_norm": 106.375,
+      "learning_rate": 5.819158460161146e-08,
+      "loss": 95.1509,
+      "step": 4705
+    },
+    {
+      "epoch": 0.9485509298593411,
+      "grad_norm": 104.9375,
+      "learning_rate": 5.7072515666965083e-08,
+      "loss": 95.5377,
+      "step": 4710
+    },
+    {
+      "epoch": 0.9495578841373233,
+      "grad_norm": 105.0,
+      "learning_rate": 5.595344673231871e-08,
+      "loss": 96.0342,
+      "step": 4715
+    },
+    {
+      "epoch": 0.9505648384153057,
+      "grad_norm": 106.8125,
+      "learning_rate": 5.483437779767233e-08,
+      "loss": 95.7919,
+      "step": 4720
+    },
+    {
+      "epoch": 0.9515717926932881,
+      "grad_norm": 104.0625,
+      "learning_rate": 5.3715308863025955e-08,
+      "loss": 95.4794,
+      "step": 4725
+    },
+    {
+      "epoch": 0.9525787469712703,
+      "grad_norm": 105.1875,
+      "learning_rate": 5.2596239928379586e-08,
+      "loss": 96.2796,
+      "step": 4730
+    },
+    {
+      "epoch": 0.9535857012492527,
+      "grad_norm": 107.4375,
+      "learning_rate": 5.147717099373322e-08,
+      "loss": 96.9097,
+      "step": 4735
+    },
+    {
+      "epoch": 0.9545926555272349,
+      "grad_norm": 104.0,
+      "learning_rate": 5.035810205908684e-08,
+      "loss": 95.2215,
+      "step": 4740
+    },
+    {
+      "epoch": 0.9555996098052173,
+      "grad_norm": 103.875,
+      "learning_rate": 4.9239033124440465e-08,
+      "loss": 95.531,
+      "step": 4745
+    },
+    {
+      "epoch": 0.9566065640831996,
+      "grad_norm": 104.5625,
+      "learning_rate": 4.811996418979409e-08,
+      "loss": 94.8213,
+      "step": 4750
+    },
+    {
+      "epoch": 0.9576135183611819,
+      "grad_norm": 103.125,
+      "learning_rate": 4.700089525514771e-08,
+      "loss": 94.9124,
+      "step": 4755
+    },
+    {
+      "epoch": 0.9586204726391643,
+      "grad_norm": 108.75,
+      "learning_rate": 4.588182632050134e-08,
+      "loss": 94.8872,
+      "step": 4760
+    },
+    {
+      "epoch": 0.9596274269171465,
+      "grad_norm": 106.0625,
+      "learning_rate": 4.476275738585497e-08,
+      "loss": 94.9003,
+      "step": 4765
+    },
+    {
+      "epoch": 0.9606343811951289,
+      "grad_norm": 109.0,
+      "learning_rate": 4.364368845120859e-08,
+      "loss": 97.4909,
+      "step": 4770
+    },
+    {
+      "epoch": 0.9616413354731111,
+      "grad_norm": 105.1875,
+      "learning_rate": 4.252461951656222e-08,
+      "loss": 95.4977,
+      "step": 4775
+    },
+    {
+      "epoch": 0.9626482897510935,
+      "grad_norm": 103.4375,
+      "learning_rate": 4.1405550581915846e-08,
+      "loss": 95.1702,
+      "step": 4780
+    },
+    {
+      "epoch": 0.9636552440290758,
+      "grad_norm": 105.4375,
+      "learning_rate": 4.028648164726947e-08,
+      "loss": 95.1124,
+      "step": 4785
+    },
+    {
+      "epoch": 0.9646621983070581,
+      "grad_norm": 107.125,
+      "learning_rate": 3.9167412712623094e-08,
+      "loss": 95.5008,
+      "step": 4790
+    },
+    {
+      "epoch": 0.9656691525850404,
+      "grad_norm": 103.8125,
+      "learning_rate": 3.8048343777976725e-08,
+      "loss": 96.745,
+      "step": 4795
+    },
+    {
+      "epoch": 0.9666761068630227,
+      "grad_norm": 103.875,
+      "learning_rate": 3.692927484333035e-08,
+      "loss": 96.3884,
+      "step": 4800
+    },
+    {
+      "epoch": 0.9676830611410051,
+      "grad_norm": 103.375,
+      "learning_rate": 3.581020590868397e-08,
+      "loss": 94.6912,
+      "step": 4805
+    },
+    {
+      "epoch": 0.9686900154189874,
+      "grad_norm": 106.125,
+      "learning_rate": 3.4691136974037597e-08,
+      "loss": 95.0865,
+      "step": 4810
+    },
+    {
+      "epoch": 0.9696969696969697,
+      "grad_norm": 103.625,
+      "learning_rate": 3.357206803939123e-08,
+      "loss": 93.7961,
+      "step": 4815
+    },
+    {
+      "epoch": 0.970703923974952,
+      "grad_norm": 104.3125,
+      "learning_rate": 3.245299910474485e-08,
+      "loss": 95.4935,
+      "step": 4820
+    },
+    {
+      "epoch": 0.9717108782529343,
+      "grad_norm": 102.0625,
+      "learning_rate": 3.133393017009848e-08,
+      "loss": 95.072,
+      "step": 4825
+    },
+    {
+      "epoch": 0.9727178325309166,
+      "grad_norm": 107.1875,
+      "learning_rate": 3.0214861235452106e-08,
+      "loss": 95.2949,
+      "step": 4830
+    },
+    {
+      "epoch": 0.973724786808899,
+      "grad_norm": 105.375,
+      "learning_rate": 2.909579230080573e-08,
+      "loss": 94.6876,
+      "step": 4835
+    },
+    {
+      "epoch": 0.9747317410868813,
+      "grad_norm": 104.5625,
+      "learning_rate": 2.7976723366159354e-08,
+      "loss": 96.9518,
+      "step": 4840
+    },
+    {
+      "epoch": 0.9757386953648636,
+      "grad_norm": 106.1875,
+      "learning_rate": 2.6857654431512978e-08,
+      "loss": 95.4756,
+      "step": 4845
+    },
+    {
+      "epoch": 0.9767456496428459,
+      "grad_norm": 107.5625,
+      "learning_rate": 2.573858549686661e-08,
+      "loss": 94.8716,
+      "step": 4850
+    },
+    {
+      "epoch": 0.9777526039208282,
+      "grad_norm": 103.4375,
+      "learning_rate": 2.4619516562220232e-08,
+      "loss": 95.0801,
+      "step": 4855
+    },
+    {
+      "epoch": 0.9787595581988106,
+      "grad_norm": 102.75,
+      "learning_rate": 2.3500447627573856e-08,
+      "loss": 95.3318,
+      "step": 4860
+    },
+    {
+      "epoch": 0.9797665124767928,
+      "grad_norm": 107.75,
+      "learning_rate": 2.2381378692927484e-08,
+      "loss": 95.9054,
+      "step": 4865
+    },
+    {
+      "epoch": 0.9807734667547752,
+      "grad_norm": 103.1875,
+      "learning_rate": 2.126230975828111e-08,
+      "loss": 95.6948,
+      "step": 4870
+    },
+    {
+      "epoch": 0.9817804210327575,
+      "grad_norm": 107.5625,
+      "learning_rate": 2.0143240823634735e-08,
+      "loss": 95.1651,
+      "step": 4875
+    },
+    {
+      "epoch": 0.9827873753107398,
+      "grad_norm": 102.3125,
+      "learning_rate": 1.9024171888988362e-08,
+      "loss": 95.8977,
+      "step": 4880
+    },
+    {
+      "epoch": 0.9837943295887221,
+      "grad_norm": 107.3125,
+      "learning_rate": 1.7905102954341986e-08,
+      "loss": 94.0943,
+      "step": 4885
+    },
+    {
+      "epoch": 0.9848012838667044,
+      "grad_norm": 105.8125,
+      "learning_rate": 1.6786034019695614e-08,
+      "loss": 96.5686,
+      "step": 4890
+    },
+    {
+      "epoch": 0.9858082381446868,
+      "grad_norm": 104.75,
+      "learning_rate": 1.566696508504924e-08,
+      "loss": 96.2139,
+      "step": 4895
+    },
+    {
+      "epoch": 0.986815192422669,
+      "grad_norm": 106.5625,
+      "learning_rate": 1.4547896150402865e-08,
+      "loss": 96.4123,
+      "step": 4900
+    },
+    {
+      "epoch": 0.9878221467006514,
+      "grad_norm": 107.5,
+      "learning_rate": 1.3428827215756489e-08,
+      "loss": 95.4067,
+      "step": 4905
+    },
+    {
+      "epoch": 0.9888291009786336,
+      "grad_norm": 106.125,
+      "learning_rate": 1.2309758281110116e-08,
+      "loss": 96.4161,
+      "step": 4910
+    },
+    {
+      "epoch": 0.989836055256616,
+      "grad_norm": 104.6875,
+      "learning_rate": 1.1190689346463742e-08,
+      "loss": 94.9028,
+      "step": 4915
+    },
+    {
+      "epoch": 0.9908430095345984,
+      "grad_norm": 106.0625,
+      "learning_rate": 1.0071620411817367e-08,
+      "loss": 96.9095,
+      "step": 4920
+    },
+    {
+      "epoch": 0.9918499638125806,
+      "grad_norm": 106.8125,
+      "learning_rate": 8.952551477170993e-09,
+      "loss": 94.9621,
+      "step": 4925
+    },
+    {
+      "epoch": 0.992856918090563,
+      "grad_norm": 105.75,
+      "learning_rate": 7.83348254252462e-09,
+      "loss": 95.0764,
+      "step": 4930
+    },
+    {
+      "epoch": 0.9938638723685452,
+      "grad_norm": 107.5,
+      "learning_rate": 6.7144136078782444e-09,
+      "loss": 96.6513,
+      "step": 4935
+    },
+    {
+      "epoch": 0.9948708266465276,
+      "grad_norm": 104.1875,
+      "learning_rate": 5.595344673231871e-09,
+      "loss": 94.489,
+      "step": 4940
+    },
+    {
+      "epoch": 0.9958777809245098,
+      "grad_norm": 105.75,
+      "learning_rate": 4.4762757385854966e-09,
+      "loss": 95.3881,
+      "step": 4945
+    },
+    {
+      "epoch": 0.9968847352024922,
+      "grad_norm": 108.0625,
+      "learning_rate": 3.3572068039391222e-09,
+      "loss": 95.4261,
+      "step": 4950
+    },
+    {
+      "epoch": 0.9978916894804746,
+      "grad_norm": 105.9375,
+      "learning_rate": 2.2381378692927483e-09,
+      "loss": 95.8491,
+      "step": 4955
+    },
+    {
+      "epoch": 0.9988986437584568,
+      "grad_norm": 104.6875,
+      "learning_rate": 1.1190689346463741e-09,
+      "loss": 94.8424,
+      "step": 4960
+    },
+    {
+      "epoch": 0.9999055980364392,
+      "grad_norm": 101.75,
+      "learning_rate": 0.0,
+      "loss": 94.7523,
+      "step": 4965
     }
   ],
   "logging_steps": 5,
@@ -6393,12 +7044,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop":
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
-  "total_flos":
+  "total_flos": 2.151015743419633e+19,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
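The appended `log_history` entries cover one record every 5 steps (`"logging_steps": 5`) from step 4505 through 4965, where the run ends at roughly one full epoch (`"epoch"` ≈ 0.99991, `"should_training_stop": true`). The logged `learning_rate` drops by a constant ~1.119e-09 per logging interval and reaches exactly 0.0 at the final step, consistent with a linear decay schedule that hits zero at step 4965. A minimal sketch (hypothetical local path, assuming the checkpoint was downloaded) for pulling the loss and learning-rate series back out of the saved state:

```python
import json

# Hypothetical local path; assumes last-checkpoint/ was fetched first.
with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

# Training records carry "loss"; periodic eval records carry "eval_*" keys.
train = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in train]
losses = [e["loss"] for e in train]
lrs = [e["learning_rate"] for e in train]

print(state["global_step"], round(state["epoch"], 6))  # 4965 0.999906
# Linear decay to zero at the last step: lr shrinks in proportion to
# (4965 - step) and the final logged value is exactly 0.0.
assert lrs[-1] == 0.0
```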