Training in progress, step 139000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6173b4bc562c2e11366705c8c76e7d31698b3a60389b9a754914d9b8842cf90f
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b3048a59b63da999ae8fc02b473b5d2a50c2be60b98f1004a6c79f0035ac60f1
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ef3d8a81eedcecdd331f8207cd63df8c3721e9e06bbee141ce7de5f7de358d9
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c0f0628bbbac738b6a9aa97ca88652280d641a00de879a3f6b83636f7c99513d
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -24572,11 +24572,189 @@
|
|
| 24572 |
"eval_steps_per_second": 15.219,
|
| 24573 |
"num_input_tokens_seen": 72340003200,
|
| 24574 |
"step": 138000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24575 |
}
|
| 24576 |
],
|
| 24577 |
"logging_steps": 50,
|
| 24578 |
"max_steps": 140000,
|
| 24579 |
-
"num_input_tokens_seen":
|
| 24580 |
"num_train_epochs": 2,
|
| 24581 |
"save_steps": 1000,
|
| 24582 |
"stateful_callbacks": {
|
|
@@ -24591,7 +24769,7 @@
|
|
| 24591 |
"attributes": {}
|
| 24592 |
}
|
| 24593 |
},
|
| 24594 |
-
"total_flos": 1.
|
| 24595 |
"train_batch_size": 32,
|
| 24596 |
"trial_name": null,
|
| 24597 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.3260693800159795,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 139000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 24572 |
"eval_steps_per_second": 15.219,
|
| 24573 |
"num_input_tokens_seen": 72340003200,
|
| 24574 |
"step": 138000
|
| 24575 |
+
},
|
| 24576 |
+
{
|
| 24577 |
+
"epoch": 1.3170063322084025,
|
| 24578 |
+
"grad_norm": 0.11734651029109955,
|
| 24579 |
+
"learning_rate": 1.191954812408308e-05,
|
| 24580 |
+
"loss": 2.0241,
|
| 24581 |
+
"num_input_tokens_seen": 72366217600,
|
| 24582 |
+
"step": 138050
|
| 24583 |
+
},
|
| 24584 |
+
{
|
| 24585 |
+
"epoch": 1.3174833347245907,
|
| 24586 |
+
"grad_norm": 0.11315104365348816,
|
| 24587 |
+
"learning_rate": 1.1318413143740436e-05,
|
| 24588 |
+
"loss": 2.0195,
|
| 24589 |
+
"num_input_tokens_seen": 72392425632,
|
| 24590 |
+
"step": 138100
|
| 24591 |
+
},
|
| 24592 |
+
{
|
| 24593 |
+
"epoch": 1.3179603372407789,
|
| 24594 |
+
"grad_norm": 0.11212780326604843,
|
| 24595 |
+
"learning_rate": 1.0732657886902309e-05,
|
| 24596 |
+
"loss": 2.0379,
|
| 24597 |
+
"num_input_tokens_seen": 72418637536,
|
| 24598 |
+
"step": 138150
|
| 24599 |
+
},
|
| 24600 |
+
{
|
| 24601 |
+
"epoch": 1.3184373397569673,
|
| 24602 |
+
"grad_norm": 0.11390957236289978,
|
| 24603 |
+
"learning_rate": 1.0162300788382261e-05,
|
| 24604 |
+
"loss": 2.0245,
|
| 24605 |
+
"num_input_tokens_seen": 72444850752,
|
| 24606 |
+
"step": 138200
|
| 24607 |
+
},
|
| 24608 |
+
{
|
| 24609 |
+
"epoch": 1.3189143422731555,
|
| 24610 |
+
"grad_norm": 0.11521212011575699,
|
| 24611 |
+
"learning_rate": 9.607359798384786e-06,
|
| 24612 |
+
"loss": 2.0313,
|
| 24613 |
+
"num_input_tokens_seen": 72471060032,
|
| 24614 |
+
"step": 138250
|
| 24615 |
+
},
|
| 24616 |
+
{
|
| 24617 |
+
"epoch": 1.3193913447893437,
|
| 24618 |
+
"grad_norm": 0.11375854164361954,
|
| 24619 |
+
"learning_rate": 9.0678523819408e-06,
|
| 24620 |
+
"loss": 2.0313,
|
| 24621 |
+
"num_input_tokens_seen": 72497274432,
|
| 24622 |
+
"step": 138300
|
| 24623 |
+
},
|
| 24624 |
+
{
|
| 24625 |
+
"epoch": 1.319868347305532,
|
| 24626 |
+
"grad_norm": 0.11399056017398834,
|
| 24627 |
+
"learning_rate": 8.543795518357766e-06,
|
| 24628 |
+
"loss": 2.0256,
|
| 24629 |
+
"num_input_tokens_seen": 72523485952,
|
| 24630 |
+
"step": 138350
|
| 24631 |
+
},
|
| 24632 |
+
{
|
| 24633 |
+
"epoch": 1.3203453498217204,
|
| 24634 |
+
"grad_norm": 0.11128194630146027,
|
| 24635 |
+
"learning_rate": 8.035205700685167e-06,
|
| 24636 |
+
"loss": 2.0338,
|
| 24637 |
+
"num_input_tokens_seen": 72549700352,
|
| 24638 |
+
"step": 138400
|
| 24639 |
+
},
|
| 24640 |
+
{
|
| 24641 |
+
"epoch": 1.3208223523379086,
|
| 24642 |
+
"grad_norm": 0.11179857701063156,
|
| 24643 |
+
"learning_rate": 7.542098935195918e-06,
|
| 24644 |
+
"loss": 2.0362,
|
| 24645 |
+
"num_input_tokens_seen": 72575912992,
|
| 24646 |
+
"step": 138450
|
| 24647 |
+
},
|
| 24648 |
+
{
|
| 24649 |
+
"epoch": 1.3212993548540968,
|
| 24650 |
+
"grad_norm": 0.11500924825668335,
|
| 24651 |
+
"learning_rate": 7.064490740882057e-06,
|
| 24652 |
+
"loss": 2.0285,
|
| 24653 |
+
"num_input_tokens_seen": 72602127392,
|
| 24654 |
+
"step": 138500
|
| 24655 |
+
},
|
| 24656 |
+
{
|
| 24657 |
+
"epoch": 1.3212993548540968,
|
| 24658 |
+
"eval_loss": 1.951123833656311,
|
| 24659 |
+
"eval_runtime": 82.6672,
|
| 24660 |
+
"eval_samples_per_second": 60.484,
|
| 24661 |
+
"eval_steps_per_second": 15.121,
|
| 24662 |
+
"num_input_tokens_seen": 72602127392,
|
| 24663 |
+
"step": 138500
|
| 24664 |
+
},
|
| 24665 |
+
{
|
| 24666 |
+
"epoch": 1.3217763573702852,
|
| 24667 |
+
"grad_norm": 0.1176285520195961,
|
| 24668 |
+
"learning_rate": 6.602396148966794e-06,
|
| 24669 |
+
"loss": 2.0295,
|
| 24670 |
+
"num_input_tokens_seen": 72628340704,
|
| 24671 |
+
"step": 138550
|
| 24672 |
+
},
|
| 24673 |
+
{
|
| 24674 |
+
"epoch": 1.3222533598864734,
|
| 24675 |
+
"grad_norm": 0.11359469592571259,
|
| 24676 |
+
"learning_rate": 6.15582970243117e-06,
|
| 24677 |
+
"loss": 2.0206,
|
| 24678 |
+
"num_input_tokens_seen": 72654548704,
|
| 24679 |
+
"step": 138600
|
| 24680 |
+
},
|
| 24681 |
+
{
|
| 24682 |
+
"epoch": 1.3227303624026616,
|
| 24683 |
+
"grad_norm": 0.11230379343032837,
|
| 24684 |
+
"learning_rate": 5.72480545555637e-06,
|
| 24685 |
+
"loss": 2.0285,
|
| 24686 |
+
"num_input_tokens_seen": 72680760704,
|
| 24687 |
+
"step": 138650
|
| 24688 |
+
},
|
| 24689 |
+
{
|
| 24690 |
+
"epoch": 1.3232073649188498,
|
| 24691 |
+
"grad_norm": 0.11325126886367798,
|
| 24692 |
+
"learning_rate": 5.309336973481682e-06,
|
| 24693 |
+
"loss": 2.0316,
|
| 24694 |
+
"num_input_tokens_seen": 72706975104,
|
| 24695 |
+
"step": 138700
|
| 24696 |
+
},
|
| 24697 |
+
{
|
| 24698 |
+
"epoch": 1.3236843674350383,
|
| 24699 |
+
"grad_norm": 0.11530512571334839,
|
| 24700 |
+
"learning_rate": 4.909437331777178e-06,
|
| 24701 |
+
"loss": 2.0295,
|
| 24702 |
+
"num_input_tokens_seen": 72733189504,
|
| 24703 |
+
"step": 138750
|
| 24704 |
+
},
|
| 24705 |
+
{
|
| 24706 |
+
"epoch": 1.3241613699512265,
|
| 24707 |
+
"grad_norm": 0.11637042462825775,
|
| 24708 |
+
"learning_rate": 4.52511911603265e-06,
|
| 24709 |
+
"loss": 2.0358,
|
| 24710 |
+
"num_input_tokens_seen": 72759403904,
|
| 24711 |
+
"step": 138800
|
| 24712 |
+
},
|
| 24713 |
+
{
|
| 24714 |
+
"epoch": 1.324638372467415,
|
| 24715 |
+
"grad_norm": 0.11307495832443237,
|
| 24716 |
+
"learning_rate": 4.15639442146093e-06,
|
| 24717 |
+
"loss": 2.0256,
|
| 24718 |
+
"num_input_tokens_seen": 72785609280,
|
| 24719 |
+
"step": 138850
|
| 24720 |
+
},
|
| 24721 |
+
{
|
| 24722 |
+
"epoch": 1.325115374983603,
|
| 24723 |
+
"grad_norm": 0.11408944427967072,
|
| 24724 |
+
"learning_rate": 3.803274852517968e-06,
|
| 24725 |
+
"loss": 2.0432,
|
| 24726 |
+
"num_input_tokens_seen": 72811823680,
|
| 24727 |
+
"step": 138900
|
| 24728 |
+
},
|
| 24729 |
+
{
|
| 24730 |
+
"epoch": 1.3255923774997913,
|
| 24731 |
+
"grad_norm": 0.11304306238889694,
|
| 24732 |
+
"learning_rate": 3.4657715225368535e-06,
|
| 24733 |
+
"loss": 2.0342,
|
| 24734 |
+
"num_input_tokens_seen": 72838035008,
|
| 24735 |
+
"step": 138950
|
| 24736 |
+
},
|
| 24737 |
+
{
|
| 24738 |
+
"epoch": 1.3260693800159795,
|
| 24739 |
+
"grad_norm": 0.11682960391044617,
|
| 24740 |
+
"learning_rate": 3.143895053378698e-06,
|
| 24741 |
+
"loss": 2.0353,
|
| 24742 |
+
"num_input_tokens_seen": 72864248896,
|
| 24743 |
+
"step": 139000
|
| 24744 |
+
},
|
| 24745 |
+
{
|
| 24746 |
+
"epoch": 1.3260693800159795,
|
| 24747 |
+
"eval_loss": 1.9510550498962402,
|
| 24748 |
+
"eval_runtime": 82.5623,
|
| 24749 |
+
"eval_samples_per_second": 60.56,
|
| 24750 |
+
"eval_steps_per_second": 15.14,
|
| 24751 |
+
"num_input_tokens_seen": 72864248896,
|
| 24752 |
+
"step": 139000
|
| 24753 |
}
|
| 24754 |
],
|
| 24755 |
"logging_steps": 50,
|
| 24756 |
"max_steps": 140000,
|
| 24757 |
+
"num_input_tokens_seen": 72864248896,
|
| 24758 |
"num_train_epochs": 2,
|
| 24759 |
"save_steps": 1000,
|
| 24760 |
"stateful_callbacks": {
|
|
|
|
| 24769 |
"attributes": {}
|
| 24770 |
}
|
| 24771 |
},
|
| 24772 |
+
"total_flos": 1.2895643010692137e+20,
|
| 24773 |
"train_batch_size": 32,
|
| 24774 |
"trial_name": null,
|
| 24775 |
"trial_params": null
|