Training in progress, step 134000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d56ac5cac24a22412473f2135127ddabb38b319ea83b674e986a42239b250e9
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b4d6a881e9f26105deee08c944e754ddbf4c77f455ab89089e93e0141d4bbc5a
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:24b5f8b02f183c01b91dfb927bcee2fd08e29422009a0f8c863f42c2374d464d
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f96155b98d632c68f19e59b549aa9343e95b0d1b8978f18da42e6a70e5498d0e
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -23682,11 +23682,189 @@
|
|
| 23682 |
"eval_steps_per_second": 15.141,
|
| 23683 |
"num_input_tokens_seen": 69719047840,
|
| 23684 |
"step": 133000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23685 |
}
|
| 23686 |
],
|
| 23687 |
"logging_steps": 50,
|
| 23688 |
"max_steps": 140000,
|
| 23689 |
-
"num_input_tokens_seen":
|
| 23690 |
"num_train_epochs": 2,
|
| 23691 |
"save_steps": 1000,
|
| 23692 |
"stateful_callbacks": {
|
|
@@ -23701,7 +23879,7 @@
|
|
| 23701 |
"attributes": {}
|
| 23702 |
}
|
| 23703 |
},
|
| 23704 |
-
"total_flos": 1.
|
| 23705 |
"train_batch_size": 32,
|
| 23706 |
"trial_name": null,
|
| 23707 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.2783691283971523,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 134000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 23682 |
"eval_steps_per_second": 15.141,
|
| 23683 |
"num_input_tokens_seen": 69719047840,
|
| 23684 |
"step": 133000
|
| 23685 |
+
},
|
| 23686 |
+
{
|
| 23687 |
+
"epoch": 1.269306080589575,
|
| 23688 |
+
"grad_norm": 0.12358897924423218,
|
| 23689 |
+
"learning_rate": 0.00014446875342055988,
|
| 23690 |
+
"loss": 2.0342,
|
| 23691 |
+
"num_input_tokens_seen": 69745262240,
|
| 23692 |
+
"step": 133050
|
| 23693 |
+
},
|
| 23694 |
+
{
|
| 23695 |
+
"epoch": 1.2697830831057635,
|
| 23696 |
+
"grad_norm": 0.12031599134206772,
|
| 23697 |
+
"learning_rate": 0.00014250208666766236,
|
| 23698 |
+
"loss": 2.0402,
|
| 23699 |
+
"num_input_tokens_seen": 69771476640,
|
| 23700 |
+
"step": 133100
|
| 23701 |
+
},
|
| 23702 |
+
{
|
| 23703 |
+
"epoch": 1.2702600856219517,
|
| 23704 |
+
"grad_norm": 0.12011140584945679,
|
| 23705 |
+
"learning_rate": 0.00014054667104271496,
|
| 23706 |
+
"loss": 2.0358,
|
| 23707 |
+
"num_input_tokens_seen": 69797691040,
|
| 23708 |
+
"step": 133150
|
| 23709 |
+
},
|
| 23710 |
+
{
|
| 23711 |
+
"epoch": 1.27073708813814,
|
| 23712 |
+
"grad_norm": 0.12352379411458969,
|
| 23713 |
+
"learning_rate": 0.00013860256808630427,
|
| 23714 |
+
"loss": 2.043,
|
| 23715 |
+
"num_input_tokens_seen": 69823902816,
|
| 23716 |
+
"step": 133200
|
| 23717 |
+
},
|
| 23718 |
+
{
|
| 23719 |
+
"epoch": 1.271214090654328,
|
| 23720 |
+
"grad_norm": 0.1257781833410263,
|
| 23721 |
+
"learning_rate": 0.00013666983898298656,
|
| 23722 |
+
"loss": 2.0464,
|
| 23723 |
+
"num_input_tokens_seen": 69850112224,
|
| 23724 |
+
"step": 133250
|
| 23725 |
+
},
|
| 23726 |
+
{
|
| 23727 |
+
"epoch": 1.2716910931705165,
|
| 23728 |
+
"grad_norm": 0.12694838643074036,
|
| 23729 |
+
"learning_rate": 0.00013474854455936125,
|
| 23730 |
+
"loss": 2.0401,
|
| 23731 |
+
"num_input_tokens_seen": 69876325568,
|
| 23732 |
+
"step": 133300
|
| 23733 |
+
},
|
| 23734 |
+
{
|
| 23735 |
+
"epoch": 1.2721680956867047,
|
| 23736 |
+
"grad_norm": 0.12634819746017456,
|
| 23737 |
+
"learning_rate": 0.00013283874528215734,
|
| 23738 |
+
"loss": 2.0339,
|
| 23739 |
+
"num_input_tokens_seen": 69902536928,
|
| 23740 |
+
"step": 133350
|
| 23741 |
+
},
|
| 23742 |
+
{
|
| 23743 |
+
"epoch": 1.272645098202893,
|
| 23744 |
+
"grad_norm": 0.12307710945606232,
|
| 23745 |
+
"learning_rate": 0.00013094050125632973,
|
| 23746 |
+
"loss": 2.0277,
|
| 23747 |
+
"num_input_tokens_seen": 69928748288,
|
| 23748 |
+
"step": 133400
|
| 23749 |
+
},
|
| 23750 |
+
{
|
| 23751 |
+
"epoch": 1.2731221007190814,
|
| 23752 |
+
"grad_norm": 0.12187953293323517,
|
| 23753 |
+
"learning_rate": 0.00012905387222316822,
|
| 23754 |
+
"loss": 2.0402,
|
| 23755 |
+
"num_input_tokens_seen": 69954953888,
|
| 23756 |
+
"step": 133450
|
| 23757 |
+
},
|
| 23758 |
+
{
|
| 23759 |
+
"epoch": 1.2735991032352696,
|
| 23760 |
+
"grad_norm": 0.12032655626535416,
|
| 23761 |
+
"learning_rate": 0.0001271789175584172,
|
| 23762 |
+
"loss": 2.0419,
|
| 23763 |
+
"num_input_tokens_seen": 69981165632,
|
| 23764 |
+
"step": 133500
|
| 23765 |
+
},
|
| 23766 |
+
{
|
| 23767 |
+
"epoch": 1.2735991032352696,
|
| 23768 |
+
"eval_loss": 1.9568681716918945,
|
| 23769 |
+
"eval_runtime": 82.7406,
|
| 23770 |
+
"eval_samples_per_second": 60.43,
|
| 23771 |
+
"eval_steps_per_second": 15.107,
|
| 23772 |
+
"num_input_tokens_seen": 69981165632,
|
| 23773 |
+
"step": 133500
|
| 23774 |
+
},
|
| 23775 |
+
{
|
| 23776 |
+
"epoch": 1.2740761057514578,
|
| 23777 |
+
"grad_norm": 0.12817110121250153,
|
| 23778 |
+
"learning_rate": 0.00012531569627040635,
|
| 23779 |
+
"loss": 2.034,
|
| 23780 |
+
"num_input_tokens_seen": 70007368800,
|
| 23781 |
+
"step": 133550
|
| 23782 |
+
},
|
| 23783 |
+
{
|
| 23784 |
+
"epoch": 1.274553108267646,
|
| 23785 |
+
"grad_norm": 0.13095012307167053,
|
| 23786 |
+
"learning_rate": 0.00012346426699819457,
|
| 23787 |
+
"loss": 2.0346,
|
| 23788 |
+
"num_input_tokens_seen": 70033578048,
|
| 23789 |
+
"step": 133600
|
| 23790 |
+
},
|
| 23791 |
+
{
|
| 23792 |
+
"epoch": 1.2750301107838344,
|
| 23793 |
+
"grad_norm": 0.12582357227802277,
|
| 23794 |
+
"learning_rate": 0.00012162468800972342,
|
| 23795 |
+
"loss": 2.0398,
|
| 23796 |
+
"num_input_tokens_seen": 70059792448,
|
| 23797 |
+
"step": 133650
|
| 23798 |
+
},
|
| 23799 |
+
{
|
| 23800 |
+
"epoch": 1.2755071133000226,
|
| 23801 |
+
"grad_norm": 0.11612017452716827,
|
| 23802 |
+
"learning_rate": 0.00011979701719998454,
|
| 23803 |
+
"loss": 2.0341,
|
| 23804 |
+
"num_input_tokens_seen": 70086003648,
|
| 23805 |
+
"step": 133700
|
| 23806 |
+
},
|
| 23807 |
+
{
|
| 23808 |
+
"epoch": 1.2759841158162109,
|
| 23809 |
+
"grad_norm": 0.12256049364805222,
|
| 23810 |
+
"learning_rate": 0.00011798131208919626,
|
| 23811 |
+
"loss": 2.029,
|
| 23812 |
+
"num_input_tokens_seen": 70112204096,
|
| 23813 |
+
"step": 133750
|
| 23814 |
+
},
|
| 23815 |
+
{
|
| 23816 |
+
"epoch": 1.2764611183323993,
|
| 23817 |
+
"grad_norm": 0.11747635900974274,
|
| 23818 |
+
"learning_rate": 0.00011617762982099444,
|
| 23819 |
+
"loss": 2.0355,
|
| 23820 |
+
"num_input_tokens_seen": 70138411104,
|
| 23821 |
+
"step": 133800
|
| 23822 |
+
},
|
| 23823 |
+
{
|
| 23824 |
+
"epoch": 1.2769381208485875,
|
| 23825 |
+
"grad_norm": 0.12225272506475449,
|
| 23826 |
+
"learning_rate": 0.00011438602716063329,
|
| 23827 |
+
"loss": 2.042,
|
| 23828 |
+
"num_input_tokens_seen": 70164623328,
|
| 23829 |
+
"step": 133850
|
| 23830 |
+
},
|
| 23831 |
+
{
|
| 23832 |
+
"epoch": 1.2774151233647757,
|
| 23833 |
+
"grad_norm": 0.1293225735425949,
|
| 23834 |
+
"learning_rate": 0.00011260656049319957,
|
| 23835 |
+
"loss": 2.0367,
|
| 23836 |
+
"num_input_tokens_seen": 70190833888,
|
| 23837 |
+
"step": 133900
|
| 23838 |
+
},
|
| 23839 |
+
{
|
| 23840 |
+
"epoch": 1.277892125880964,
|
| 23841 |
+
"grad_norm": 0.12261593341827393,
|
| 23842 |
+
"learning_rate": 0.0001108392858218371,
|
| 23843 |
+
"loss": 2.0444,
|
| 23844 |
+
"num_input_tokens_seen": 70217043648,
|
| 23845 |
+
"step": 133950
|
| 23846 |
+
},
|
| 23847 |
+
{
|
| 23848 |
+
"epoch": 1.2783691283971523,
|
| 23849 |
+
"grad_norm": 0.11957214772701263,
|
| 23850 |
+
"learning_rate": 0.0001090842587659851,
|
| 23851 |
+
"loss": 2.0345,
|
| 23852 |
+
"num_input_tokens_seen": 70243253472,
|
| 23853 |
+
"step": 134000
|
| 23854 |
+
},
|
| 23855 |
+
{
|
| 23856 |
+
"epoch": 1.2783691283971523,
|
| 23857 |
+
"eval_loss": 1.955412745475769,
|
| 23858 |
+
"eval_runtime": 82.5981,
|
| 23859 |
+
"eval_samples_per_second": 60.534,
|
| 23860 |
+
"eval_steps_per_second": 15.134,
|
| 23861 |
+
"num_input_tokens_seen": 70243253472,
|
| 23862 |
+
"step": 134000
|
| 23863 |
}
|
| 23864 |
],
|
| 23865 |
"logging_steps": 50,
|
| 23866 |
"max_steps": 140000,
|
| 23867 |
+
"num_input_tokens_seen": 70243253472,
|
| 23868 |
"num_train_epochs": 2,
|
| 23869 |
"save_steps": 1000,
|
| 23870 |
"stateful_callbacks": {
|
|
|
|
| 23879 |
"attributes": {}
|
| 23880 |
}
|
| 23881 |
},
|
| 23882 |
+
"total_flos": 1.243177462760067e+20,
|
| 23883 |
"train_batch_size": 32,
|
| 23884 |
"trial_name": null,
|
| 23885 |
"trial_params": null
|