Training in progress, step 117000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0ccf8d1ee3da4942ba95f7a3a54578d6c16809257e74ad1be0b26812641e3056
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5f1a7487954ffb44d1bab57c681b14f7a5680ded0c52a6c8bb015865beff7ed1
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:48498b576bbabf1971bbdc1b63e18da5e5d6ff6ee2d2893d269ddf346414745c
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3f7ac3b8ebf1c0d4bfd4f038411c119a54a5a538a834ebe005f085cdf984be31
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -20656,11 +20656,189 @@
|
|
| 20656 |
"eval_steps_per_second": 15.098,
|
| 20657 |
"num_input_tokens_seen": 60807636160,
|
| 20658 |
"step": 116000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20659 |
}
|
| 20660 |
],
|
| 20661 |
"logging_steps": 50,
|
| 20662 |
"max_steps": 140000,
|
| 20663 |
-
"num_input_tokens_seen":
|
| 20664 |
"num_train_epochs": 2,
|
| 20665 |
"save_steps": 1000,
|
| 20666 |
"stateful_callbacks": {
|
|
@@ -20675,7 +20853,7 @@
|
|
| 20675 |
"attributes": {}
|
| 20676 |
}
|
| 20677 |
},
|
| 20678 |
-
"total_flos": 1.
|
| 20679 |
"train_batch_size": 32,
|
| 20680 |
"trial_name": null,
|
| 20681 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.1161882728931396,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 117000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 20656 |
"eval_steps_per_second": 15.098,
|
| 20657 |
"num_input_tokens_seen": 60807636160,
|
| 20658 |
"step": 116000
|
| 20659 |
+
},
|
| 20660 |
+
{
|
| 20661 |
+
"epoch": 1.1071252250855623,
|
| 20662 |
+
"grad_norm": 0.13944968581199646,
|
| 20663 |
+
"learning_rate": 0.0009492603104988907,
|
| 20664 |
+
"loss": 2.1028,
|
| 20665 |
+
"num_input_tokens_seen": 60833850560,
|
| 20666 |
+
"step": 116050
|
| 20667 |
+
},
|
| 20668 |
+
{
|
| 20669 |
+
"epoch": 1.1076022276017505,
|
| 20670 |
+
"grad_norm": 0.14454355835914612,
|
| 20671 |
+
"learning_rate": 0.0009480220479843627,
|
| 20672 |
+
"loss": 2.0995,
|
| 20673 |
+
"num_input_tokens_seen": 60860064224,
|
| 20674 |
+
"step": 116100
|
| 20675 |
+
},
|
| 20676 |
+
{
|
| 20677 |
+
"epoch": 1.108079230117939,
|
| 20678 |
+
"grad_norm": 0.1737418919801712,
|
| 20679 |
+
"learning_rate": 0.0009467696853780625,
|
| 20680 |
+
"loss": 2.0841,
|
| 20681 |
+
"num_input_tokens_seen": 60886278080,
|
| 20682 |
+
"step": 116150
|
| 20683 |
+
},
|
| 20684 |
+
{
|
| 20685 |
+
"epoch": 1.1085562326341272,
|
| 20686 |
+
"grad_norm": 0.1442703902721405,
|
| 20687 |
+
"learning_rate": 0.0009455032620941839,
|
| 20688 |
+
"loss": 2.0847,
|
| 20689 |
+
"num_input_tokens_seen": 60912488608,
|
| 20690 |
+
"step": 116200
|
| 20691 |
+
},
|
| 20692 |
+
{
|
| 20693 |
+
"epoch": 1.1090332351503154,
|
| 20694 |
+
"grad_norm": 0.14151588082313538,
|
| 20695 |
+
"learning_rate": 0.0009442228179894363,
|
| 20696 |
+
"loss": 2.0939,
|
| 20697 |
+
"num_input_tokens_seen": 60938699264,
|
| 20698 |
+
"step": 116250
|
| 20699 |
+
},
|
| 20700 |
+
{
|
| 20701 |
+
"epoch": 1.1095102376665036,
|
| 20702 |
+
"grad_norm": 0.12823954224586487,
|
| 20703 |
+
"learning_rate": 0.00094292839336179,
|
| 20704 |
+
"loss": 2.0911,
|
| 20705 |
+
"num_input_tokens_seen": 60964913664,
|
| 20706 |
+
"step": 116300
|
| 20707 |
+
},
|
| 20708 |
+
{
|
| 20709 |
+
"epoch": 1.109987240182692,
|
| 20710 |
+
"grad_norm": 0.1551038920879364,
|
| 20711 |
+
"learning_rate": 0.0009416200289492091,
|
| 20712 |
+
"loss": 2.0905,
|
| 20713 |
+
"num_input_tokens_seen": 60991126176,
|
| 20714 |
+
"step": 116350
|
| 20715 |
+
},
|
| 20716 |
+
{
|
| 20717 |
+
"epoch": 1.1104642426988802,
|
| 20718 |
+
"grad_norm": 0.14844666421413422,
|
| 20719 |
+
"learning_rate": 0.000940297765928369,
|
| 20720 |
+
"loss": 2.0853,
|
| 20721 |
+
"num_input_tokens_seen": 61017336640,
|
| 20722 |
+
"step": 116400
|
| 20723 |
+
},
|
| 20724 |
+
{
|
| 20725 |
+
"epoch": 1.1109412452150684,
|
| 20726 |
+
"grad_norm": 0.14786940813064575,
|
| 20727 |
+
"learning_rate": 0.0009389616459133597,
|
| 20728 |
+
"loss": 2.0948,
|
| 20729 |
+
"num_input_tokens_seen": 61043543488,
|
| 20730 |
+
"step": 116450
|
| 20731 |
+
},
|
| 20732 |
+
{
|
| 20733 |
+
"epoch": 1.1114182477312569,
|
| 20734 |
+
"grad_norm": 0.1404752880334854,
|
| 20735 |
+
"learning_rate": 0.0009376117109543769,
|
| 20736 |
+
"loss": 2.0889,
|
| 20737 |
+
"num_input_tokens_seen": 61069752768,
|
| 20738 |
+
"step": 116500
|
| 20739 |
+
},
|
| 20740 |
+
{
|
| 20741 |
+
"epoch": 1.1114182477312569,
|
| 20742 |
+
"eval_loss": 2.007530450820923,
|
| 20743 |
+
"eval_runtime": 83.3145,
|
| 20744 |
+
"eval_samples_per_second": 60.014,
|
| 20745 |
+
"eval_steps_per_second": 15.003,
|
| 20746 |
+
"num_input_tokens_seen": 61069752768,
|
| 20747 |
+
"step": 116500
|
| 20748 |
+
},
|
| 20749 |
+
{
|
| 20750 |
+
"epoch": 1.111895250247445,
|
| 20751 |
+
"grad_norm": 0.14887551963329315,
|
| 20752 |
+
"learning_rate": 0.0009362480035363986,
|
| 20753 |
+
"loss": 2.0906,
|
| 20754 |
+
"num_input_tokens_seen": 61095967168,
|
| 20755 |
+
"step": 116550
|
| 20756 |
+
},
|
| 20757 |
+
{
|
| 20758 |
+
"epoch": 1.1123722527636333,
|
| 20759 |
+
"grad_norm": 0.1436939537525177,
|
| 20760 |
+
"learning_rate": 0.0009348705665778478,
|
| 20761 |
+
"loss": 2.0857,
|
| 20762 |
+
"num_input_tokens_seen": 61122178400,
|
| 20763 |
+
"step": 116600
|
| 20764 |
+
},
|
| 20765 |
+
{
|
| 20766 |
+
"epoch": 1.1128492552798217,
|
| 20767 |
+
"grad_norm": 0.15015645325183868,
|
| 20768 |
+
"learning_rate": 0.0009334794434292415,
|
| 20769 |
+
"loss": 2.0877,
|
| 20770 |
+
"num_input_tokens_seen": 61148383936,
|
| 20771 |
+
"step": 116650
|
| 20772 |
+
},
|
| 20773 |
+
{
|
| 20774 |
+
"epoch": 1.11332625779601,
|
| 20775 |
+
"grad_norm": 0.15639320015907288,
|
| 20776 |
+
"learning_rate": 0.0009320746778718274,
|
| 20777 |
+
"loss": 2.082,
|
| 20778 |
+
"num_input_tokens_seen": 61174590560,
|
| 20779 |
+
"step": 116700
|
| 20780 |
+
},
|
| 20781 |
+
{
|
| 20782 |
+
"epoch": 1.1138032603121981,
|
| 20783 |
+
"grad_norm": 0.1376616209745407,
|
| 20784 |
+
"learning_rate": 0.0009306563141162046,
|
| 20785 |
+
"loss": 2.0893,
|
| 20786 |
+
"num_input_tokens_seen": 61200799104,
|
| 20787 |
+
"step": 116750
|
| 20788 |
+
},
|
| 20789 |
+
{
|
| 20790 |
+
"epoch": 1.1142802628283863,
|
| 20791 |
+
"grad_norm": 0.13897264003753662,
|
| 20792 |
+
"learning_rate": 0.000929224396800933,
|
| 20793 |
+
"loss": 2.0885,
|
| 20794 |
+
"num_input_tokens_seen": 61227004960,
|
| 20795 |
+
"step": 116800
|
| 20796 |
+
},
|
| 20797 |
+
{
|
| 20798 |
+
"epoch": 1.1147572653445748,
|
| 20799 |
+
"grad_norm": 0.16240862011909485,
|
| 20800 |
+
"learning_rate": 0.0009277789709911291,
|
| 20801 |
+
"loss": 2.0772,
|
| 20802 |
+
"num_input_tokens_seen": 61253214976,
|
| 20803 |
+
"step": 116850
|
| 20804 |
+
},
|
| 20805 |
+
{
|
| 20806 |
+
"epoch": 1.115234267860763,
|
| 20807 |
+
"grad_norm": 0.13620969653129578,
|
| 20808 |
+
"learning_rate": 0.0009263200821770461,
|
| 20809 |
+
"loss": 2.0815,
|
| 20810 |
+
"num_input_tokens_seen": 61279425344,
|
| 20811 |
+
"step": 116900
|
| 20812 |
+
},
|
| 20813 |
+
{
|
| 20814 |
+
"epoch": 1.1157112703769512,
|
| 20815 |
+
"grad_norm": 0.13625779747962952,
|
| 20816 |
+
"learning_rate": 0.0009248477762726437,
|
| 20817 |
+
"loss": 2.0834,
|
| 20818 |
+
"num_input_tokens_seen": 61305623936,
|
| 20819 |
+
"step": 116950
|
| 20820 |
+
},
|
| 20821 |
+
{
|
| 20822 |
+
"epoch": 1.1161882728931396,
|
| 20823 |
+
"grad_norm": 0.1379876434803009,
|
| 20824 |
+
"learning_rate": 0.0009233620996141421,
|
| 20825 |
+
"loss": 2.0879,
|
| 20826 |
+
"num_input_tokens_seen": 61331831488,
|
| 20827 |
+
"step": 117000
|
| 20828 |
+
},
|
| 20829 |
+
{
|
| 20830 |
+
"epoch": 1.1161882728931396,
|
| 20831 |
+
"eval_loss": 2.0054421424865723,
|
| 20832 |
+
"eval_runtime": 82.7611,
|
| 20833 |
+
"eval_samples_per_second": 60.415,
|
| 20834 |
+
"eval_steps_per_second": 15.104,
|
| 20835 |
+
"num_input_tokens_seen": 61331831488,
|
| 20836 |
+
"step": 117000
|
| 20837 |
}
|
| 20838 |
],
|
| 20839 |
"logging_steps": 50,
|
| 20840 |
"max_steps": 140000,
|
| 20841 |
+
"num_input_tokens_seen": 61331831488,
|
| 20842 |
"num_train_epochs": 2,
|
| 20843 |
"save_steps": 1000,
|
| 20844 |
"stateful_callbacks": {
|
|
|
|
| 20853 |
"attributes": {}
|
| 20854 |
}
|
| 20855 |
},
|
| 20856 |
+
"total_flos": 1.0854615480769659e+20,
|
| 20857 |
"train_batch_size": 32,
|
| 20858 |
"trial_name": null,
|
| 20859 |
"trial_params": null
|