Training in progress, step 45000, checkpoint
Browse files- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/pytorch_model.bin +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +354 -4
- last-checkpoint/training_args.bin +1 -1
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 304481530
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c34146894dfe342922a0c8a606eac0350f76f11ea9e61c107c4fbf6ed4906e82
|
| 3 |
size 304481530
|
last-checkpoint/pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 402029570
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:502f558a0369d8b367137ad2a3eafab0d0eba581e23c553e257aa247277dbe02
|
| 3 |
size 402029570
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e5bd1df9dd287561b432a4ed1887fdcf8336e4c007fc54d3406e19e31c0bf33c
|
| 3 |
size 14960
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6a04ed282bfc2791246883061552e93b53278d67ed9004819c40202da8064598
|
| 3 |
size 14960
|
last-checkpoint/rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:06efddacd0797f2209b470dd13b726f0378cede9676bfe8789e9ef52e5513689
|
| 3 |
size 14960
|
last-checkpoint/rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f815d3a5b83aa01364285e2a8f42845f4c04a057e51ff2194a37de3386990687
|
| 3 |
size 14960
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0a0f1ff729c197851c973f7c1a73abcd6f67c6109dc7b232bef28f6096173a7b
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -15408,10 +15408,360 @@
|
|
| 15408 |
"learning_rate": 0.0004858584078736993,
|
| 15409 |
"loss": 17.2386,
|
| 15410 |
"step": 44000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15411 |
}
|
| 15412 |
],
|
| 15413 |
"logging_steps": 20,
|
| 15414 |
-
"max_steps":
|
| 15415 |
"num_input_tokens_seen": 0,
|
| 15416 |
"num_train_epochs": 3,
|
| 15417 |
"save_steps": 1000,
|
|
@@ -15427,7 +15777,7 @@
|
|
| 15427 |
"attributes": {}
|
| 15428 |
}
|
| 15429 |
},
|
| 15430 |
-
"total_flos": 3.
|
| 15431 |
"train_batch_size": 48,
|
| 15432 |
"trial_name": null,
|
| 15433 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.06665916133887148,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 45000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 15408 |
"learning_rate": 0.0004858584078736993,
|
| 15409 |
"loss": 17.2386,
|
| 15410 |
"step": 44000
|
| 15411 |
+
},
|
| 15412 |
+
{
|
| 15413 |
+
"epoch": 0.0652074729363805,
|
| 15414 |
+
"grad_norm": 7.21875,
|
| 15415 |
+
"learning_rate": 0.0004892531243702858,
|
| 15416 |
+
"loss": 18.4759,
|
| 15417 |
+
"step": 44020
|
| 15418 |
+
},
|
| 15419 |
+
{
|
| 15420 |
+
"epoch": 0.06523709923030889,
|
| 15421 |
+
"grad_norm": 7.5,
|
| 15422 |
+
"learning_rate": 0.0004892481854352782,
|
| 15423 |
+
"loss": 18.408,
|
| 15424 |
+
"step": 44040
|
| 15425 |
+
},
|
| 15426 |
+
{
|
| 15427 |
+
"epoch": 0.06526672552423728,
|
| 15428 |
+
"grad_norm": 7.90625,
|
| 15429 |
+
"learning_rate": 0.0004892432465002707,
|
| 15430 |
+
"loss": 18.2598,
|
| 15431 |
+
"step": 44060
|
| 15432 |
+
},
|
| 15433 |
+
{
|
| 15434 |
+
"epoch": 0.06529635181816566,
|
| 15435 |
+
"grad_norm": 7.625,
|
| 15436 |
+
"learning_rate": 0.0004892383075652631,
|
| 15437 |
+
"loss": 18.2414,
|
| 15438 |
+
"step": 44080
|
| 15439 |
+
},
|
| 15440 |
+
{
|
| 15441 |
+
"epoch": 0.06532597811209405,
|
| 15442 |
+
"grad_norm": 7.5,
|
| 15443 |
+
"learning_rate": 0.0004892333686302555,
|
| 15444 |
+
"loss": 18.2764,
|
| 15445 |
+
"step": 44100
|
| 15446 |
+
},
|
| 15447 |
+
{
|
| 15448 |
+
"epoch": 0.06535560440602244,
|
| 15449 |
+
"grad_norm": 6.59375,
|
| 15450 |
+
"learning_rate": 0.000489228429695248,
|
| 15451 |
+
"loss": 18.2161,
|
| 15452 |
+
"step": 44120
|
| 15453 |
+
},
|
| 15454 |
+
{
|
| 15455 |
+
"epoch": 0.06538523069995082,
|
| 15456 |
+
"grad_norm": 7.125,
|
| 15457 |
+
"learning_rate": 0.0004892234907602404,
|
| 15458 |
+
"loss": 18.2661,
|
| 15459 |
+
"step": 44140
|
| 15460 |
+
},
|
| 15461 |
+
{
|
| 15462 |
+
"epoch": 0.06541485699387921,
|
| 15463 |
+
"grad_norm": 7.0625,
|
| 15464 |
+
"learning_rate": 0.0004892185518252329,
|
| 15465 |
+
"loss": 18.2414,
|
| 15466 |
+
"step": 44160
|
| 15467 |
+
},
|
| 15468 |
+
{
|
| 15469 |
+
"epoch": 0.0654444832878076,
|
| 15470 |
+
"grad_norm": 7.34375,
|
| 15471 |
+
"learning_rate": 0.0004892136128902253,
|
| 15472 |
+
"loss": 18.1743,
|
| 15473 |
+
"step": 44180
|
| 15474 |
+
},
|
| 15475 |
+
{
|
| 15476 |
+
"epoch": 0.06547410958173598,
|
| 15477 |
+
"grad_norm": 7.34375,
|
| 15478 |
+
"learning_rate": 0.0004892086739552177,
|
| 15479 |
+
"loss": 18.1813,
|
| 15480 |
+
"step": 44200
|
| 15481 |
+
},
|
| 15482 |
+
{
|
| 15483 |
+
"epoch": 0.06550373587566437,
|
| 15484 |
+
"grad_norm": 7.03125,
|
| 15485 |
+
"learning_rate": 0.0004892037350202102,
|
| 15486 |
+
"loss": 18.239,
|
| 15487 |
+
"step": 44220
|
| 15488 |
+
},
|
| 15489 |
+
{
|
| 15490 |
+
"epoch": 0.06553336216959275,
|
| 15491 |
+
"grad_norm": 7.375,
|
| 15492 |
+
"learning_rate": 0.0004891987960852026,
|
| 15493 |
+
"loss": 18.084,
|
| 15494 |
+
"step": 44240
|
| 15495 |
+
},
|
| 15496 |
+
{
|
| 15497 |
+
"epoch": 0.06556298846352114,
|
| 15498 |
+
"grad_norm": 7.375,
|
| 15499 |
+
"learning_rate": 0.000489193857150195,
|
| 15500 |
+
"loss": 18.1419,
|
| 15501 |
+
"step": 44260
|
| 15502 |
+
},
|
| 15503 |
+
{
|
| 15504 |
+
"epoch": 0.06559261475744953,
|
| 15505 |
+
"grad_norm": 8.1875,
|
| 15506 |
+
"learning_rate": 0.0004891889182151874,
|
| 15507 |
+
"loss": 18.1111,
|
| 15508 |
+
"step": 44280
|
| 15509 |
+
},
|
| 15510 |
+
{
|
| 15511 |
+
"epoch": 0.06562224105137791,
|
| 15512 |
+
"grad_norm": 7.09375,
|
| 15513 |
+
"learning_rate": 0.0004891839792801799,
|
| 15514 |
+
"loss": 18.0775,
|
| 15515 |
+
"step": 44300
|
| 15516 |
+
},
|
| 15517 |
+
{
|
| 15518 |
+
"epoch": 0.0656518673453063,
|
| 15519 |
+
"grad_norm": 7.34375,
|
| 15520 |
+
"learning_rate": 0.0004891790403451723,
|
| 15521 |
+
"loss": 18.1751,
|
| 15522 |
+
"step": 44320
|
| 15523 |
+
},
|
| 15524 |
+
{
|
| 15525 |
+
"epoch": 0.0656814936392347,
|
| 15526 |
+
"grad_norm": 6.96875,
|
| 15527 |
+
"learning_rate": 0.0004891741014101648,
|
| 15528 |
+
"loss": 18.1515,
|
| 15529 |
+
"step": 44340
|
| 15530 |
+
},
|
| 15531 |
+
{
|
| 15532 |
+
"epoch": 0.06571111993316309,
|
| 15533 |
+
"grad_norm": 7.6875,
|
| 15534 |
+
"learning_rate": 0.0004891691624751572,
|
| 15535 |
+
"loss": 18.1534,
|
| 15536 |
+
"step": 44360
|
| 15537 |
+
},
|
| 15538 |
+
{
|
| 15539 |
+
"epoch": 0.06574074622709147,
|
| 15540 |
+
"grad_norm": 6.96875,
|
| 15541 |
+
"learning_rate": 0.0004891642235401497,
|
| 15542 |
+
"loss": 18.1406,
|
| 15543 |
+
"step": 44380
|
| 15544 |
+
},
|
| 15545 |
+
{
|
| 15546 |
+
"epoch": 0.06577037252101986,
|
| 15547 |
+
"grad_norm": 8.625,
|
| 15548 |
+
"learning_rate": 0.000489159284605142,
|
| 15549 |
+
"loss": 18.1022,
|
| 15550 |
+
"step": 44400
|
| 15551 |
+
},
|
| 15552 |
+
{
|
| 15553 |
+
"epoch": 0.06579999881494825,
|
| 15554 |
+
"grad_norm": 7.9375,
|
| 15555 |
+
"learning_rate": 0.0004891543456701344,
|
| 15556 |
+
"loss": 18.0375,
|
| 15557 |
+
"step": 44420
|
| 15558 |
+
},
|
| 15559 |
+
{
|
| 15560 |
+
"epoch": 0.06582962510887663,
|
| 15561 |
+
"grad_norm": 8.5625,
|
| 15562 |
+
"learning_rate": 0.0004891494067351269,
|
| 15563 |
+
"loss": 18.0538,
|
| 15564 |
+
"step": 44440
|
| 15565 |
+
},
|
| 15566 |
+
{
|
| 15567 |
+
"epoch": 0.06585925140280502,
|
| 15568 |
+
"grad_norm": 6.90625,
|
| 15569 |
+
"learning_rate": 0.0004891444678001193,
|
| 15570 |
+
"loss": 18.0487,
|
| 15571 |
+
"step": 44460
|
| 15572 |
+
},
|
| 15573 |
+
{
|
| 15574 |
+
"epoch": 0.0658888776967334,
|
| 15575 |
+
"grad_norm": 7.8125,
|
| 15576 |
+
"learning_rate": 0.0004891395288651117,
|
| 15577 |
+
"loss": 18.1347,
|
| 15578 |
+
"step": 44480
|
| 15579 |
+
},
|
| 15580 |
+
{
|
| 15581 |
+
"epoch": 0.06591850399066179,
|
| 15582 |
+
"grad_norm": 7.21875,
|
| 15583 |
+
"learning_rate": 0.0004891345899301041,
|
| 15584 |
+
"loss": 18.0459,
|
| 15585 |
+
"step": 44500
|
| 15586 |
+
},
|
| 15587 |
+
{
|
| 15588 |
+
"epoch": 0.06594813028459018,
|
| 15589 |
+
"grad_norm": 8.6875,
|
| 15590 |
+
"learning_rate": 0.0004891296509950966,
|
| 15591 |
+
"loss": 18.0135,
|
| 15592 |
+
"step": 44520
|
| 15593 |
+
},
|
| 15594 |
+
{
|
| 15595 |
+
"epoch": 0.06597775657851856,
|
| 15596 |
+
"grad_norm": 7.8125,
|
| 15597 |
+
"learning_rate": 0.000489124712060089,
|
| 15598 |
+
"loss": 18.0667,
|
| 15599 |
+
"step": 44540
|
| 15600 |
+
},
|
| 15601 |
+
{
|
| 15602 |
+
"epoch": 0.06600738287244695,
|
| 15603 |
+
"grad_norm": 7.34375,
|
| 15604 |
+
"learning_rate": 0.0004891197731250815,
|
| 15605 |
+
"loss": 18.0663,
|
| 15606 |
+
"step": 44560
|
| 15607 |
+
},
|
| 15608 |
+
{
|
| 15609 |
+
"epoch": 0.06603700916637534,
|
| 15610 |
+
"grad_norm": 7.5,
|
| 15611 |
+
"learning_rate": 0.0004891148341900739,
|
| 15612 |
+
"loss": 18.035,
|
| 15613 |
+
"step": 44580
|
| 15614 |
+
},
|
| 15615 |
+
{
|
| 15616 |
+
"epoch": 0.06606663546030372,
|
| 15617 |
+
"grad_norm": 7.28125,
|
| 15618 |
+
"learning_rate": 0.0004891098952550664,
|
| 15619 |
+
"loss": 18.0706,
|
| 15620 |
+
"step": 44600
|
| 15621 |
+
},
|
| 15622 |
+
{
|
| 15623 |
+
"epoch": 0.06609626175423211,
|
| 15624 |
+
"grad_norm": 6.6875,
|
| 15625 |
+
"learning_rate": 0.0004891049563200588,
|
| 15626 |
+
"loss": 18.0513,
|
| 15627 |
+
"step": 44620
|
| 15628 |
+
},
|
| 15629 |
+
{
|
| 15630 |
+
"epoch": 0.0661258880481605,
|
| 15631 |
+
"grad_norm": 7.78125,
|
| 15632 |
+
"learning_rate": 0.0004891000173850512,
|
| 15633 |
+
"loss": 18.0185,
|
| 15634 |
+
"step": 44640
|
| 15635 |
+
},
|
| 15636 |
+
{
|
| 15637 |
+
"epoch": 0.0661555143420889,
|
| 15638 |
+
"grad_norm": 6.90625,
|
| 15639 |
+
"learning_rate": 0.0004890950784500437,
|
| 15640 |
+
"loss": 17.9851,
|
| 15641 |
+
"step": 44660
|
| 15642 |
+
},
|
| 15643 |
+
{
|
| 15644 |
+
"epoch": 0.06618514063601728,
|
| 15645 |
+
"grad_norm": 6.65625,
|
| 15646 |
+
"learning_rate": 0.0004890901395150361,
|
| 15647 |
+
"loss": 18.0043,
|
| 15648 |
+
"step": 44680
|
| 15649 |
+
},
|
| 15650 |
+
{
|
| 15651 |
+
"epoch": 0.06621476692994567,
|
| 15652 |
+
"grad_norm": 7.28125,
|
| 15653 |
+
"learning_rate": 0.0004890852005800285,
|
| 15654 |
+
"loss": 17.9712,
|
| 15655 |
+
"step": 44700
|
| 15656 |
+
},
|
| 15657 |
+
{
|
| 15658 |
+
"epoch": 0.06624439322387406,
|
| 15659 |
+
"grad_norm": 7.34375,
|
| 15660 |
+
"learning_rate": 0.000489080261645021,
|
| 15661 |
+
"loss": 18.0501,
|
| 15662 |
+
"step": 44720
|
| 15663 |
+
},
|
| 15664 |
+
{
|
| 15665 |
+
"epoch": 0.06627401951780244,
|
| 15666 |
+
"grad_norm": 7.59375,
|
| 15667 |
+
"learning_rate": 0.0004890753227100134,
|
| 15668 |
+
"loss": 18.029,
|
| 15669 |
+
"step": 44740
|
| 15670 |
+
},
|
| 15671 |
+
{
|
| 15672 |
+
"epoch": 0.06630364581173083,
|
| 15673 |
+
"grad_norm": 6.5625,
|
| 15674 |
+
"learning_rate": 0.0004890703837750059,
|
| 15675 |
+
"loss": 18.0173,
|
| 15676 |
+
"step": 44760
|
| 15677 |
+
},
|
| 15678 |
+
{
|
| 15679 |
+
"epoch": 0.06633327210565922,
|
| 15680 |
+
"grad_norm": 7.71875,
|
| 15681 |
+
"learning_rate": 0.0004890654448399983,
|
| 15682 |
+
"loss": 17.9833,
|
| 15683 |
+
"step": 44780
|
| 15684 |
+
},
|
| 15685 |
+
{
|
| 15686 |
+
"epoch": 0.0663628983995876,
|
| 15687 |
+
"grad_norm": 7.125,
|
| 15688 |
+
"learning_rate": 0.0004890605059049907,
|
| 15689 |
+
"loss": 18.0,
|
| 15690 |
+
"step": 44800
|
| 15691 |
+
},
|
| 15692 |
+
{
|
| 15693 |
+
"epoch": 0.06639252469351599,
|
| 15694 |
+
"grad_norm": 7.15625,
|
| 15695 |
+
"learning_rate": 0.0004890555669699832,
|
| 15696 |
+
"loss": 17.9968,
|
| 15697 |
+
"step": 44820
|
| 15698 |
+
},
|
| 15699 |
+
{
|
| 15700 |
+
"epoch": 0.06642215098744438,
|
| 15701 |
+
"grad_norm": 6.96875,
|
| 15702 |
+
"learning_rate": 0.0004890506280349756,
|
| 15703 |
+
"loss": 18.009,
|
| 15704 |
+
"step": 44840
|
| 15705 |
+
},
|
| 15706 |
+
{
|
| 15707 |
+
"epoch": 0.06645177728137276,
|
| 15708 |
+
"grad_norm": 6.75,
|
| 15709 |
+
"learning_rate": 0.000489045689099968,
|
| 15710 |
+
"loss": 17.9378,
|
| 15711 |
+
"step": 44860
|
| 15712 |
+
},
|
| 15713 |
+
{
|
| 15714 |
+
"epoch": 0.06648140357530115,
|
| 15715 |
+
"grad_norm": 8.625,
|
| 15716 |
+
"learning_rate": 0.0004890407501649604,
|
| 15717 |
+
"loss": 17.9775,
|
| 15718 |
+
"step": 44880
|
| 15719 |
+
},
|
| 15720 |
+
{
|
| 15721 |
+
"epoch": 0.06651102986922953,
|
| 15722 |
+
"grad_norm": 7.28125,
|
| 15723 |
+
"learning_rate": 0.0004890358112299529,
|
| 15724 |
+
"loss": 17.9533,
|
| 15725 |
+
"step": 44900
|
| 15726 |
+
},
|
| 15727 |
+
{
|
| 15728 |
+
"epoch": 0.06654065616315792,
|
| 15729 |
+
"grad_norm": 6.75,
|
| 15730 |
+
"learning_rate": 0.0004890308722949453,
|
| 15731 |
+
"loss": 18.0079,
|
| 15732 |
+
"step": 44920
|
| 15733 |
+
},
|
| 15734 |
+
{
|
| 15735 |
+
"epoch": 0.06657028245708631,
|
| 15736 |
+
"grad_norm": 6.8125,
|
| 15737 |
+
"learning_rate": 0.0004890259333599378,
|
| 15738 |
+
"loss": 17.9616,
|
| 15739 |
+
"step": 44940
|
| 15740 |
+
},
|
| 15741 |
+
{
|
| 15742 |
+
"epoch": 0.0665999087510147,
|
| 15743 |
+
"grad_norm": 7.28125,
|
| 15744 |
+
"learning_rate": 0.0004890209944249302,
|
| 15745 |
+
"loss": 17.9617,
|
| 15746 |
+
"step": 44960
|
| 15747 |
+
},
|
| 15748 |
+
{
|
| 15749 |
+
"epoch": 0.0666295350449431,
|
| 15750 |
+
"grad_norm": 7.25,
|
| 15751 |
+
"learning_rate": 0.0004890160554899227,
|
| 15752 |
+
"loss": 17.9462,
|
| 15753 |
+
"step": 44980
|
| 15754 |
+
},
|
| 15755 |
+
{
|
| 15756 |
+
"epoch": 0.06665916133887148,
|
| 15757 |
+
"grad_norm": 7.21875,
|
| 15758 |
+
"learning_rate": 0.000489011116554915,
|
| 15759 |
+
"loss": 17.8648,
|
| 15760 |
+
"step": 45000
|
| 15761 |
}
|
| 15762 |
],
|
| 15763 |
"logging_steps": 20,
|
| 15764 |
+
"max_steps": 2025228,
|
| 15765 |
"num_input_tokens_seen": 0,
|
| 15766 |
"num_train_epochs": 3,
|
| 15767 |
"save_steps": 1000,
|
|
|
|
| 15777 |
"attributes": {}
|
| 15778 |
}
|
| 15779 |
},
|
| 15780 |
+
"total_flos": 3.3082299293261365e+19,
|
| 15781 |
"train_batch_size": 48,
|
| 15782 |
"trial_name": null,
|
| 15783 |
"trial_params": null
|
last-checkpoint/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 5432
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1ae6fe7865a6680f0788decd4b8035db04ae39b0ae4392f872489469c00e7d58
|
| 3 |
size 5432
|