Training in progress, step 123000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cef5b67a6a8ef1b7b03d42987cf14119de3a2a743fc8652bcc28538e2c6f502f
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:03925e5e99d9cbfffe2f6300cf8385c7fca65c8ed5a96f6e0b64b1da83665e80
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:057702d02e4981608a0b19960ab61ff20cc438831297a4986309cdb565b1c450
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e98c7489b04ae19323aa5fe9264a9e2511b478d8f623351ee3b05babc6a227f
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -21724,11 +21724,189 @@
|
|
| 21724 |
"eval_steps_per_second": 15.195,
|
| 21725 |
"num_input_tokens_seen": 63952872768,
|
| 21726 |
"step": 122000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21727 |
}
|
| 21728 |
],
|
| 21729 |
"logging_steps": 50,
|
| 21730 |
"max_steps": 140000,
|
| 21731 |
-
"num_input_tokens_seen":
|
| 21732 |
"num_train_epochs": 2,
|
| 21733 |
"save_steps": 1000,
|
| 21734 |
"stateful_callbacks": {
|
|
@@ -21743,7 +21921,7 @@
|
|
| 21743 |
"attributes": {}
|
| 21744 |
}
|
| 21745 |
},
|
| 21746 |
-
"total_flos": 1.
|
| 21747 |
"train_batch_size": 32,
|
| 21748 |
"trial_name": null,
|
| 21749 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.1734285748357323,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 123000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 21724 |
"eval_steps_per_second": 15.195,
|
| 21725 |
"num_input_tokens_seen": 63952872768,
|
| 21726 |
"step": 122000
|
| 21727 |
+
},
|
| 21728 |
+
{
|
| 21729 |
+
"epoch": 1.164365527028155,
|
| 21730 |
+
"grad_norm": 0.13863904774188995,
|
| 21731 |
+
"learning_rate": 0.0007144112572668733,
|
| 21732 |
+
"loss": 2.0703,
|
| 21733 |
+
"num_input_tokens_seen": 63979084224,
|
| 21734 |
+
"step": 122050
|
| 21735 |
+
},
|
| 21736 |
+
{
|
| 21737 |
+
"epoch": 1.1648425295443434,
|
| 21738 |
+
"grad_norm": 0.1426379680633545,
|
| 21739 |
+
"learning_rate": 0.0007118738970516943,
|
| 21740 |
+
"loss": 2.0766,
|
| 21741 |
+
"num_input_tokens_seen": 64005286944,
|
| 21742 |
+
"step": 122100
|
| 21743 |
+
},
|
| 21744 |
+
{
|
| 21745 |
+
"epoch": 1.1653195320605316,
|
| 21746 |
+
"grad_norm": 0.13977181911468506,
|
| 21747 |
+
"learning_rate": 0.0007093298687687141,
|
| 21748 |
+
"loss": 2.0692,
|
| 21749 |
+
"num_input_tokens_seen": 64031487744,
|
| 21750 |
+
"step": 122150
|
| 21751 |
+
},
|
| 21752 |
+
{
|
| 21753 |
+
"epoch": 1.1657965345767198,
|
| 21754 |
+
"grad_norm": 0.1425238400697708,
|
| 21755 |
+
"learning_rate": 0.0007067792524832604,
|
| 21756 |
+
"loss": 2.0662,
|
| 21757 |
+
"num_input_tokens_seen": 64057695552,
|
| 21758 |
+
"step": 122200
|
| 21759 |
+
},
|
| 21760 |
+
{
|
| 21761 |
+
"epoch": 1.1662735370929083,
|
| 21762 |
+
"grad_norm": 0.15061677992343903,
|
| 21763 |
+
"learning_rate": 0.0007042221284679982,
|
| 21764 |
+
"loss": 2.0781,
|
| 21765 |
+
"num_input_tokens_seen": 64083893664,
|
| 21766 |
+
"step": 122250
|
| 21767 |
+
},
|
| 21768 |
+
{
|
| 21769 |
+
"epoch": 1.1667505396090965,
|
| 21770 |
+
"grad_norm": 0.12374892085790634,
|
| 21771 |
+
"learning_rate": 0.0007016585772004026,
|
| 21772 |
+
"loss": 2.0745,
|
| 21773 |
+
"num_input_tokens_seen": 64110107392,
|
| 21774 |
+
"step": 122300
|
| 21775 |
+
},
|
| 21776 |
+
{
|
| 21777 |
+
"epoch": 1.1672275421252847,
|
| 21778 |
+
"grad_norm": 0.1427278071641922,
|
| 21779 |
+
"learning_rate": 0.0006990886793602267,
|
| 21780 |
+
"loss": 2.0861,
|
| 21781 |
+
"num_input_tokens_seen": 64136321792,
|
| 21782 |
+
"step": 122350
|
| 21783 |
+
},
|
| 21784 |
+
{
|
| 21785 |
+
"epoch": 1.1677045446414729,
|
| 21786 |
+
"grad_norm": 0.15141050517559052,
|
| 21787 |
+
"learning_rate": 0.0006965125158269618,
|
| 21788 |
+
"loss": 2.0767,
|
| 21789 |
+
"num_input_tokens_seen": 64162534656,
|
| 21790 |
+
"step": 122400
|
| 21791 |
+
},
|
| 21792 |
+
{
|
| 21793 |
+
"epoch": 1.1681815471576613,
|
| 21794 |
+
"grad_norm": 0.13262976706027985,
|
| 21795 |
+
"learning_rate": 0.0006939301676772927,
|
| 21796 |
+
"loss": 2.0662,
|
| 21797 |
+
"num_input_tokens_seen": 64188740064,
|
| 21798 |
+
"step": 122450
|
| 21799 |
+
},
|
| 21800 |
+
{
|
| 21801 |
+
"epoch": 1.1686585496738495,
|
| 21802 |
+
"grad_norm": 0.13390204310417175,
|
| 21803 |
+
"learning_rate": 0.000691341716182545,
|
| 21804 |
+
"loss": 2.0684,
|
| 21805 |
+
"num_input_tokens_seen": 64214942816,
|
| 21806 |
+
"step": 122500
|
| 21807 |
+
},
|
| 21808 |
+
{
|
| 21809 |
+
"epoch": 1.1686585496738495,
|
| 21810 |
+
"eval_loss": 1.9892343282699585,
|
| 21811 |
+
"eval_runtime": 81.7351,
|
| 21812 |
+
"eval_samples_per_second": 61.173,
|
| 21813 |
+
"eval_steps_per_second": 15.293,
|
| 21814 |
+
"num_input_tokens_seen": 64214942816,
|
| 21815 |
+
"step": 122500
|
| 21816 |
+
},
|
| 21817 |
+
{
|
| 21818 |
+
"epoch": 1.1691355521900377,
|
| 21819 |
+
"grad_norm": 0.14351387321949005,
|
| 21820 |
+
"learning_rate": 0.0006887472428061285,
|
| 21821 |
+
"loss": 2.0611,
|
| 21822 |
+
"num_input_tokens_seen": 64241151872,
|
| 21823 |
+
"step": 122550
|
| 21824 |
+
},
|
| 21825 |
+
{
|
| 21826 |
+
"epoch": 1.1696125547062262,
|
| 21827 |
+
"grad_norm": 0.1321556568145752,
|
| 21828 |
+
"learning_rate": 0.0006861468292009726,
|
| 21829 |
+
"loss": 2.0726,
|
| 21830 |
+
"num_input_tokens_seen": 64267354176,
|
| 21831 |
+
"step": 122600
|
| 21832 |
+
},
|
| 21833 |
+
{
|
| 21834 |
+
"epoch": 1.1700895572224144,
|
| 21835 |
+
"grad_norm": 0.12825502455234528,
|
| 21836 |
+
"learning_rate": 0.0006835405572069572,
|
| 21837 |
+
"loss": 2.0703,
|
| 21838 |
+
"num_input_tokens_seen": 64293568544,
|
| 21839 |
+
"step": 122650
|
| 21840 |
+
},
|
| 21841 |
+
{
|
| 21842 |
+
"epoch": 1.1705665597386026,
|
| 21843 |
+
"grad_norm": 0.1376345157623291,
|
| 21844 |
+
"learning_rate": 0.0006809285088483361,
|
| 21845 |
+
"loss": 2.0789,
|
| 21846 |
+
"num_input_tokens_seen": 64319782944,
|
| 21847 |
+
"step": 122700
|
| 21848 |
+
},
|
| 21849 |
+
{
|
| 21850 |
+
"epoch": 1.1710435622547908,
|
| 21851 |
+
"grad_norm": 0.14178837835788727,
|
| 21852 |
+
"learning_rate": 0.0006783107663311565,
|
| 21853 |
+
"loss": 2.0755,
|
| 21854 |
+
"num_input_tokens_seen": 64345996064,
|
| 21855 |
+
"step": 122750
|
| 21856 |
+
},
|
| 21857 |
+
{
|
| 21858 |
+
"epoch": 1.1715205647709792,
|
| 21859 |
+
"grad_norm": 0.1475340873003006,
|
| 21860 |
+
"learning_rate": 0.0006756874120406714,
|
| 21861 |
+
"loss": 2.0668,
|
| 21862 |
+
"num_input_tokens_seen": 64372202944,
|
| 21863 |
+
"step": 122800
|
| 21864 |
+
},
|
| 21865 |
+
{
|
| 21866 |
+
"epoch": 1.1719975672871674,
|
| 21867 |
+
"grad_norm": 0.13012921810150146,
|
| 21868 |
+
"learning_rate": 0.0006730585285387465,
|
| 21869 |
+
"loss": 2.0618,
|
| 21870 |
+
"num_input_tokens_seen": 64398414944,
|
| 21871 |
+
"step": 122850
|
| 21872 |
+
},
|
| 21873 |
+
{
|
| 21874 |
+
"epoch": 1.1724745698033556,
|
| 21875 |
+
"grad_norm": 0.13203522562980652,
|
| 21876 |
+
"learning_rate": 0.0006704241985612625,
|
| 21877 |
+
"loss": 2.0712,
|
| 21878 |
+
"num_input_tokens_seen": 64424627264,
|
| 21879 |
+
"step": 122900
|
| 21880 |
+
},
|
| 21881 |
+
{
|
| 21882 |
+
"epoch": 1.172951572319544,
|
| 21883 |
+
"grad_norm": 0.13648848235607147,
|
| 21884 |
+
"learning_rate": 0.0006677845050155106,
|
| 21885 |
+
"loss": 2.0694,
|
| 21886 |
+
"num_input_tokens_seen": 64450839392,
|
| 21887 |
+
"step": 122950
|
| 21888 |
+
},
|
| 21889 |
+
{
|
| 21890 |
+
"epoch": 1.1734285748357323,
|
| 21891 |
+
"grad_norm": 0.1383182257413864,
|
| 21892 |
+
"learning_rate": 0.0006651395309775837,
|
| 21893 |
+
"loss": 2.0564,
|
| 21894 |
+
"num_input_tokens_seen": 64477051392,
|
| 21895 |
+
"step": 123000
|
| 21896 |
+
},
|
| 21897 |
+
{
|
| 21898 |
+
"epoch": 1.1734285748357323,
|
| 21899 |
+
"eval_loss": 1.9881237745285034,
|
| 21900 |
+
"eval_runtime": 82.9953,
|
| 21901 |
+
"eval_samples_per_second": 60.244,
|
| 21902 |
+
"eval_steps_per_second": 15.061,
|
| 21903 |
+
"num_input_tokens_seen": 64477051392,
|
| 21904 |
+
"step": 123000
|
| 21905 |
}
|
| 21906 |
],
|
| 21907 |
"logging_steps": 50,
|
| 21908 |
"max_steps": 140000,
|
| 21909 |
+
"num_input_tokens_seen": 64477051392,
|
| 21910 |
"num_train_epochs": 2,
|
| 21911 |
"save_steps": 1000,
|
| 21912 |
"stateful_callbacks": {
|
|
|
|
| 21921 |
"attributes": {}
|
| 21922 |
}
|
| 21923 |
},
|
| 21924 |
+
"total_flos": 1.141126203496661e+20,
|
| 21925 |
"train_batch_size": 32,
|
| 21926 |
"trial_name": null,
|
| 21927 |
"trial_params": null
|