Training in progress, step 20000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 373077376
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:df03cae2dd432c211456aab943782bf83ba84e08565c4c981659cb89c83a578e
|
| 3 |
size 373077376
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 209816139
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4292287a7fa690fe53e7b389faee8373877f88d995cc45d3321aeb77bf8c4af6
|
| 3 |
size 209816139
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14917
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:22e8bb13b8b5cd110e015717953ca96d5c03c35ddfe30ca45c1fab9651d07421
|
| 3 |
size 14917
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14917
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:76ace0471241ab08ffd32878e593821b741d6b0b68bcb601ea44671e5ef83eef
|
| 3 |
size 14917
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1401
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ba4436ed0869bacf238e760f8e2f2044a22ff86693a77a3015046ef89f00fc7e
|
| 3 |
size 1401
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": 2000,
|
| 3 |
"best_metric": 9.218317031860352,
|
| 4 |
"best_model_checkpoint": "./artifacts/models/base-250725-test/checkpoint-2000",
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 1000,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -5480,6 +5480,294 @@
|
|
| 5480 |
"eval_samples_per_second": 50.899,
|
| 5481 |
"eval_steps_per_second": 3.189,
|
| 5482 |
"step": 19000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5483 |
}
|
| 5484 |
],
|
| 5485 |
"logging_steps": 25,
|
|
@@ -5499,7 +5787,7 @@
|
|
| 5499 |
"attributes": {}
|
| 5500 |
}
|
| 5501 |
},
|
| 5502 |
-
"total_flos": 2.
|
| 5503 |
"train_batch_size": 8,
|
| 5504 |
"trial_name": null,
|
| 5505 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": 2000,
|
| 3 |
"best_metric": 9.218317031860352,
|
| 4 |
"best_model_checkpoint": "./artifacts/models/base-250725-test/checkpoint-2000",
|
| 5 |
+
"epoch": 0.06246213233227356,
|
| 6 |
"eval_steps": 1000,
|
| 7 |
+
"global_step": 20000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 5480 |
"eval_samples_per_second": 50.899,
|
| 5481 |
"eval_steps_per_second": 3.189,
|
| 5482 |
"step": 19000
|
| 5483 |
+
},
|
| 5484 |
+
{
|
| 5485 |
+
"epoch": 0.059417103381075226,
|
| 5486 |
+
"grad_norm": 41.5,
|
| 5487 |
+
"learning_rate": 0.0009997577752880041,
|
| 5488 |
+
"loss": 32.8132,
|
| 5489 |
+
"step": 19025
|
| 5490 |
+
},
|
| 5491 |
+
{
|
| 5492 |
+
"epoch": 0.05949518104649056,
|
| 5493 |
+
"grad_norm": 43.75,
|
| 5494 |
+
"learning_rate": 0.0009997537406276557,
|
| 5495 |
+
"loss": 32.9501,
|
| 5496 |
+
"step": 19050
|
| 5497 |
+
},
|
| 5498 |
+
{
|
| 5499 |
+
"epoch": 0.059573258711905906,
|
| 5500 |
+
"grad_norm": 45.25,
|
| 5501 |
+
"learning_rate": 0.0009997496726505228,
|
| 5502 |
+
"loss": 32.7061,
|
| 5503 |
+
"step": 19075
|
| 5504 |
+
},
|
| 5505 |
+
{
|
| 5506 |
+
"epoch": 0.05965133637732125,
|
| 5507 |
+
"grad_norm": 37.5,
|
| 5508 |
+
"learning_rate": 0.0009997455713568763,
|
| 5509 |
+
"loss": 32.7181,
|
| 5510 |
+
"step": 19100
|
| 5511 |
+
},
|
| 5512 |
+
{
|
| 5513 |
+
"epoch": 0.059729414042736594,
|
| 5514 |
+
"grad_norm": 41.0,
|
| 5515 |
+
"learning_rate": 0.00099974143674699,
|
| 5516 |
+
"loss": 32.554,
|
| 5517 |
+
"step": 19125
|
| 5518 |
+
},
|
| 5519 |
+
{
|
| 5520 |
+
"epoch": 0.05980749170815193,
|
| 5521 |
+
"grad_norm": 41.5,
|
| 5522 |
+
"learning_rate": 0.0009997372688211395,
|
| 5523 |
+
"loss": 32.7137,
|
| 5524 |
+
"step": 19150
|
| 5525 |
+
},
|
| 5526 |
+
{
|
| 5527 |
+
"epoch": 0.059885569373567274,
|
| 5528 |
+
"grad_norm": 45.0,
|
| 5529 |
+
"learning_rate": 0.0009997330675796023,
|
| 5530 |
+
"loss": 33.0025,
|
| 5531 |
+
"step": 19175
|
| 5532 |
+
},
|
| 5533 |
+
{
|
| 5534 |
+
"epoch": 0.05996364703898262,
|
| 5535 |
+
"grad_norm": 42.0,
|
| 5536 |
+
"learning_rate": 0.000999728833022659,
|
| 5537 |
+
"loss": 32.9643,
|
| 5538 |
+
"step": 19200
|
| 5539 |
+
},
|
| 5540 |
+
{
|
| 5541 |
+
"epoch": 0.06004172470439796,
|
| 5542 |
+
"grad_norm": 52.5,
|
| 5543 |
+
"learning_rate": 0.0009997245651505915,
|
| 5544 |
+
"loss": 32.8268,
|
| 5545 |
+
"step": 19225
|
| 5546 |
+
},
|
| 5547 |
+
{
|
| 5548 |
+
"epoch": 0.0601198023698133,
|
| 5549 |
+
"grad_norm": 43.0,
|
| 5550 |
+
"learning_rate": 0.0009997202639636844,
|
| 5551 |
+
"loss": 32.8,
|
| 5552 |
+
"step": 19250
|
| 5553 |
+
},
|
| 5554 |
+
{
|
| 5555 |
+
"epoch": 0.06019788003522864,
|
| 5556 |
+
"grad_norm": 56.5,
|
| 5557 |
+
"learning_rate": 0.0009997159294622246,
|
| 5558 |
+
"loss": 32.9133,
|
| 5559 |
+
"step": 19275
|
| 5560 |
+
},
|
| 5561 |
+
{
|
| 5562 |
+
"epoch": 0.060275957700643985,
|
| 5563 |
+
"grad_norm": 44.25,
|
| 5564 |
+
"learning_rate": 0.000999711561646501,
|
| 5565 |
+
"loss": 32.8573,
|
| 5566 |
+
"step": 19300
|
| 5567 |
+
},
|
| 5568 |
+
{
|
| 5569 |
+
"epoch": 0.06035403536605933,
|
| 5570 |
+
"grad_norm": 44.0,
|
| 5571 |
+
"learning_rate": 0.0009997071605168043,
|
| 5572 |
+
"loss": 32.7512,
|
| 5573 |
+
"step": 19325
|
| 5574 |
+
},
|
| 5575 |
+
{
|
| 5576 |
+
"epoch": 0.060432113031474666,
|
| 5577 |
+
"grad_norm": 36.5,
|
| 5578 |
+
"learning_rate": 0.000999702726073429,
|
| 5579 |
+
"loss": 32.9202,
|
| 5580 |
+
"step": 19350
|
| 5581 |
+
},
|
| 5582 |
+
{
|
| 5583 |
+
"epoch": 0.06051019069689001,
|
| 5584 |
+
"grad_norm": 40.0,
|
| 5585 |
+
"learning_rate": 0.0009996982583166695,
|
| 5586 |
+
"loss": 32.942,
|
| 5587 |
+
"step": 19375
|
| 5588 |
+
},
|
| 5589 |
+
{
|
| 5590 |
+
"epoch": 0.06058826836230535,
|
| 5591 |
+
"grad_norm": 39.0,
|
| 5592 |
+
"learning_rate": 0.0009996937572468246,
|
| 5593 |
+
"loss": 32.8775,
|
| 5594 |
+
"step": 19400
|
| 5595 |
+
},
|
| 5596 |
+
{
|
| 5597 |
+
"epoch": 0.0606663460277207,
|
| 5598 |
+
"grad_norm": 37.0,
|
| 5599 |
+
"learning_rate": 0.000999689222864194,
|
| 5600 |
+
"loss": 32.8532,
|
| 5601 |
+
"step": 19425
|
| 5602 |
+
},
|
| 5603 |
+
{
|
| 5604 |
+
"epoch": 0.06074442369313603,
|
| 5605 |
+
"grad_norm": 47.25,
|
| 5606 |
+
"learning_rate": 0.0009996846551690798,
|
| 5607 |
+
"loss": 32.9941,
|
| 5608 |
+
"step": 19450
|
| 5609 |
+
},
|
| 5610 |
+
{
|
| 5611 |
+
"epoch": 0.06082250135855138,
|
| 5612 |
+
"grad_norm": 38.0,
|
| 5613 |
+
"learning_rate": 0.0009996800541617868,
|
| 5614 |
+
"loss": 32.8616,
|
| 5615 |
+
"step": 19475
|
| 5616 |
+
},
|
| 5617 |
+
{
|
| 5618 |
+
"epoch": 0.06090057902396672,
|
| 5619 |
+
"grad_norm": 39.5,
|
| 5620 |
+
"learning_rate": 0.0009996754198426216,
|
| 5621 |
+
"loss": 32.9031,
|
| 5622 |
+
"step": 19500
|
| 5623 |
+
},
|
| 5624 |
+
{
|
| 5625 |
+
"epoch": 0.060978656689382064,
|
| 5626 |
+
"grad_norm": 44.5,
|
| 5627 |
+
"learning_rate": 0.0009996707522118933,
|
| 5628 |
+
"loss": 33.0028,
|
| 5629 |
+
"step": 19525
|
| 5630 |
+
},
|
| 5631 |
+
{
|
| 5632 |
+
"epoch": 0.0610567343547974,
|
| 5633 |
+
"grad_norm": 39.75,
|
| 5634 |
+
"learning_rate": 0.0009996660512699128,
|
| 5635 |
+
"loss": 32.8195,
|
| 5636 |
+
"step": 19550
|
| 5637 |
+
},
|
| 5638 |
+
{
|
| 5639 |
+
"epoch": 0.061134812020212745,
|
| 5640 |
+
"grad_norm": 40.75,
|
| 5641 |
+
"learning_rate": 0.0009996613170169936,
|
| 5642 |
+
"loss": 32.571,
|
| 5643 |
+
"step": 19575
|
| 5644 |
+
},
|
| 5645 |
+
{
|
| 5646 |
+
"epoch": 0.06121288968562809,
|
| 5647 |
+
"grad_norm": 36.75,
|
| 5648 |
+
"learning_rate": 0.0009996565494534517,
|
| 5649 |
+
"loss": 32.5517,
|
| 5650 |
+
"step": 19600
|
| 5651 |
+
},
|
| 5652 |
+
{
|
| 5653 |
+
"epoch": 0.06129096735104343,
|
| 5654 |
+
"grad_norm": 38.0,
|
| 5655 |
+
"learning_rate": 0.0009996517485796044,
|
| 5656 |
+
"loss": 32.5484,
|
| 5657 |
+
"step": 19625
|
| 5658 |
+
},
|
| 5659 |
+
{
|
| 5660 |
+
"epoch": 0.06136904501645877,
|
| 5661 |
+
"grad_norm": 41.75,
|
| 5662 |
+
"learning_rate": 0.000999646914395772,
|
| 5663 |
+
"loss": 32.5895,
|
| 5664 |
+
"step": 19650
|
| 5665 |
+
},
|
| 5666 |
+
{
|
| 5667 |
+
"epoch": 0.06144712268187411,
|
| 5668 |
+
"grad_norm": 42.0,
|
| 5669 |
+
"learning_rate": 0.0009996420469022766,
|
| 5670 |
+
"loss": 32.8765,
|
| 5671 |
+
"step": 19675
|
| 5672 |
+
},
|
| 5673 |
+
{
|
| 5674 |
+
"epoch": 0.061525200347289456,
|
| 5675 |
+
"grad_norm": 38.5,
|
| 5676 |
+
"learning_rate": 0.0009996371460994431,
|
| 5677 |
+
"loss": 32.8793,
|
| 5678 |
+
"step": 19700
|
| 5679 |
+
},
|
| 5680 |
+
{
|
| 5681 |
+
"epoch": 0.0616032780127048,
|
| 5682 |
+
"grad_norm": 40.25,
|
| 5683 |
+
"learning_rate": 0.0009996322119875977,
|
| 5684 |
+
"loss": 33.0708,
|
| 5685 |
+
"step": 19725
|
| 5686 |
+
},
|
| 5687 |
+
{
|
| 5688 |
+
"epoch": 0.06168135567812014,
|
| 5689 |
+
"grad_norm": 38.0,
|
| 5690 |
+
"learning_rate": 0.00099962724456707,
|
| 5691 |
+
"loss": 33.188,
|
| 5692 |
+
"step": 19750
|
| 5693 |
+
},
|
| 5694 |
+
{
|
| 5695 |
+
"epoch": 0.06175943334353548,
|
| 5696 |
+
"grad_norm": 49.0,
|
| 5697 |
+
"learning_rate": 0.0009996222438381904,
|
| 5698 |
+
"loss": 33.2918,
|
| 5699 |
+
"step": 19775
|
| 5700 |
+
},
|
| 5701 |
+
{
|
| 5702 |
+
"epoch": 0.061837511008950824,
|
| 5703 |
+
"grad_norm": 44.75,
|
| 5704 |
+
"learning_rate": 0.0009996172098012928,
|
| 5705 |
+
"loss": 33.4949,
|
| 5706 |
+
"step": 19800
|
| 5707 |
+
},
|
| 5708 |
+
{
|
| 5709 |
+
"epoch": 0.06191558867436617,
|
| 5710 |
+
"grad_norm": 43.25,
|
| 5711 |
+
"learning_rate": 0.0009996121424567126,
|
| 5712 |
+
"loss": 33.8741,
|
| 5713 |
+
"step": 19825
|
| 5714 |
+
},
|
| 5715 |
+
{
|
| 5716 |
+
"epoch": 0.061993666339781504,
|
| 5717 |
+
"grad_norm": 41.75,
|
| 5718 |
+
"learning_rate": 0.0009996070418047877,
|
| 5719 |
+
"loss": 33.6041,
|
| 5720 |
+
"step": 19850
|
| 5721 |
+
},
|
| 5722 |
+
{
|
| 5723 |
+
"epoch": 0.06207174400519685,
|
| 5724 |
+
"grad_norm": 40.25,
|
| 5725 |
+
"learning_rate": 0.000999601907845858,
|
| 5726 |
+
"loss": 33.6722,
|
| 5727 |
+
"step": 19875
|
| 5728 |
+
},
|
| 5729 |
+
{
|
| 5730 |
+
"epoch": 0.06214982167061219,
|
| 5731 |
+
"grad_norm": 40.5,
|
| 5732 |
+
"learning_rate": 0.000999596740580266,
|
| 5733 |
+
"loss": 33.484,
|
| 5734 |
+
"step": 19900
|
| 5735 |
+
},
|
| 5736 |
+
{
|
| 5737 |
+
"epoch": 0.062227899336027535,
|
| 5738 |
+
"grad_norm": 46.25,
|
| 5739 |
+
"learning_rate": 0.000999591540008356,
|
| 5740 |
+
"loss": 33.7352,
|
| 5741 |
+
"step": 19925
|
| 5742 |
+
},
|
| 5743 |
+
{
|
| 5744 |
+
"epoch": 0.06230597700144287,
|
| 5745 |
+
"grad_norm": 48.5,
|
| 5746 |
+
"learning_rate": 0.0009995863061304747,
|
| 5747 |
+
"loss": 33.9541,
|
| 5748 |
+
"step": 19950
|
| 5749 |
+
},
|
| 5750 |
+
{
|
| 5751 |
+
"epoch": 0.062384054666858216,
|
| 5752 |
+
"grad_norm": 44.0,
|
| 5753 |
+
"learning_rate": 0.0009995810389469711,
|
| 5754 |
+
"loss": 34.2383,
|
| 5755 |
+
"step": 19975
|
| 5756 |
+
},
|
| 5757 |
+
{
|
| 5758 |
+
"epoch": 0.06246213233227356,
|
| 5759 |
+
"grad_norm": 40.75,
|
| 5760 |
+
"learning_rate": 0.0009995757384581964,
|
| 5761 |
+
"loss": 33.8251,
|
| 5762 |
+
"step": 20000
|
| 5763 |
+
},
|
| 5764 |
+
{
|
| 5765 |
+
"epoch": 0.06246213233227356,
|
| 5766 |
+
"eval_loss": 34.19303512573242,
|
| 5767 |
+
"eval_runtime": 102.3811,
|
| 5768 |
+
"eval_samples_per_second": 50.82,
|
| 5769 |
+
"eval_steps_per_second": 3.184,
|
| 5770 |
+
"step": 20000
|
| 5771 |
}
|
| 5772 |
],
|
| 5773 |
"logging_steps": 25,
|
|
|
|
| 5787 |
"attributes": {}
|
| 5788 |
}
|
| 5789 |
},
|
| 5790 |
+
"total_flos": 2.53630733446493e+18,
|
| 5791 |
"train_batch_size": 8,
|
| 5792 |
"trial_name": null,
|
| 5793 |
"trial_params": null
|