Training in progress, step 2000000
Browse files- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/pytorch_model.bin +1 -1
- last-checkpoint/rng_state.pth +1 -1
- last-checkpoint/scaler.pt +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +383 -3
- pytorch_model.bin +1 -1
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 893439185
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:38f36466b9f2b124ce3950f4272937ae40e2fa26880ec00a4e1f83639190fb7d
|
| 3 |
size 893439185
|
last-checkpoint/pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 449471589
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5fc35de7c7ab795f6ce22b4d822a3c81dd28eb6da159fa0e6bc70e2d249fbce8
|
| 3 |
size 449471589
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21579
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c86960e82d428869302623bd9f7002f37b98a8296d67cde31b64acf1793fdd0e
|
| 3 |
size 21579
|
last-checkpoint/scaler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 559
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:26c2c5dcfeda6d6eb5b101bdcd99b94aa97e0eb4affa75fa0e151082e701b9eb
|
| 3 |
size 559
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 623
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:994a0fabdb31bb0426e3f82b99b32aaddcc1766fdd4539450b1f928f65099fb8
|
| 3 |
size 623
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
-
"epoch": 0.
|
| 5 |
-
"global_step":
|
| 6 |
"is_hyper_param_search": false,
|
| 7 |
"is_local_process_zero": true,
|
| 8 |
"is_world_process_zero": true,
|
|
@@ -14826,11 +14826,391 @@
|
|
| 14826 |
"eval_samples_per_second": 82.821,
|
| 14827 |
"eval_steps_per_second": 0.647,
|
| 14828 |
"step": 1950000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14829 |
}
|
| 14830 |
],
|
| 14831 |
"max_steps": 2000000,
|
| 14832 |
"num_train_epochs": 9223372036854775807,
|
| 14833 |
-
"total_flos": 1.
|
| 14834 |
"trial_name": null,
|
| 14835 |
"trial_params": null
|
| 14836 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
+
"epoch": 0.1,
|
| 5 |
+
"global_step": 2000000,
|
| 6 |
"is_hyper_param_search": false,
|
| 7 |
"is_local_process_zero": true,
|
| 8 |
"is_world_process_zero": true,
|
|
|
|
| 14826 |
"eval_samples_per_second": 82.821,
|
| 14827 |
"eval_steps_per_second": 0.647,
|
| 14828 |
"step": 1950000
|
| 14829 |
+
},
|
| 14830 |
+
{
|
| 14831 |
+
"epoch": 0.08,
|
| 14832 |
+
"learning_rate": 1.026354625870075e-05,
|
| 14833 |
+
"loss": 0.4364,
|
| 14834 |
+
"step": 1951000
|
| 14835 |
+
},
|
| 14836 |
+
{
|
| 14837 |
+
"epoch": 0.08,
|
| 14838 |
+
"learning_rate": 1.0253060901106556e-05,
|
| 14839 |
+
"loss": 0.4361,
|
| 14840 |
+
"step": 1952000
|
| 14841 |
+
},
|
| 14842 |
+
{
|
| 14843 |
+
"epoch": 0.08,
|
| 14844 |
+
"learning_rate": 1.0242798171546145e-05,
|
| 14845 |
+
"loss": 0.4365,
|
| 14846 |
+
"step": 1953000
|
| 14847 |
+
},
|
| 14848 |
+
{
|
| 14849 |
+
"epoch": 0.08,
|
| 14850 |
+
"learning_rate": 1.0232747509747644e-05,
|
| 14851 |
+
"loss": 0.4373,
|
| 14852 |
+
"step": 1954000
|
| 14853 |
+
},
|
| 14854 |
+
{
|
| 14855 |
+
"epoch": 0.08,
|
| 14856 |
+
"learning_rate": 1.0222899204125646e-05,
|
| 14857 |
+
"loss": 0.4362,
|
| 14858 |
+
"step": 1955000
|
| 14859 |
+
},
|
| 14860 |
+
{
|
| 14861 |
+
"epoch": 0.08,
|
| 14862 |
+
"eval_loss": 0.4164978265762329,
|
| 14863 |
+
"eval_runtime": 80.0596,
|
| 14864 |
+
"eval_samples_per_second": 79.94,
|
| 14865 |
+
"eval_steps_per_second": 0.625,
|
| 14866 |
+
"step": 1955000
|
| 14867 |
+
},
|
| 14868 |
+
{
|
| 14869 |
+
"epoch": 0.08,
|
| 14870 |
+
"learning_rate": 1.0213263451653737e-05,
|
| 14871 |
+
"loss": 0.4367,
|
| 14872 |
+
"step": 1956000
|
| 14873 |
+
},
|
| 14874 |
+
{
|
| 14875 |
+
"epoch": 0.08,
|
| 14876 |
+
"learning_rate": 1.0203849598659497e-05,
|
| 14877 |
+
"loss": 0.4367,
|
| 14878 |
+
"step": 1957000
|
| 14879 |
+
},
|
| 14880 |
+
{
|
| 14881 |
+
"epoch": 0.08,
|
| 14882 |
+
"learning_rate": 1.0194638827271399e-05,
|
| 14883 |
+
"loss": 0.4364,
|
| 14884 |
+
"step": 1958000
|
| 14885 |
+
},
|
| 14886 |
+
{
|
| 14887 |
+
"epoch": 0.08,
|
| 14888 |
+
"learning_rate": 1.0185640695119401e-05,
|
| 14889 |
+
"loss": 0.4363,
|
| 14890 |
+
"step": 1959000
|
| 14891 |
+
},
|
| 14892 |
+
{
|
| 14893 |
+
"epoch": 0.08,
|
| 14894 |
+
"learning_rate": 1.017685522961337e-05,
|
| 14895 |
+
"loss": 0.4362,
|
| 14896 |
+
"step": 1960000
|
| 14897 |
+
},
|
| 14898 |
+
{
|
| 14899 |
+
"epoch": 0.08,
|
| 14900 |
+
"eval_loss": 0.42052188515663147,
|
| 14901 |
+
"eval_runtime": 77.8558,
|
| 14902 |
+
"eval_samples_per_second": 82.203,
|
| 14903 |
+
"eval_steps_per_second": 0.642,
|
| 14904 |
+
"step": 1960000
|
| 14905 |
+
},
|
| 14906 |
+
{
|
| 14907 |
+
"epoch": 0.08,
|
| 14908 |
+
"learning_rate": 1.0168282457515363e-05,
|
| 14909 |
+
"loss": 0.4369,
|
| 14910 |
+
"step": 1961000
|
| 14911 |
+
},
|
| 14912 |
+
{
|
| 14913 |
+
"epoch": 0.08,
|
| 14914 |
+
"learning_rate": 1.0159930658730172e-05,
|
| 14915 |
+
"loss": 0.4364,
|
| 14916 |
+
"step": 1962000
|
| 14917 |
+
},
|
| 14918 |
+
{
|
| 14919 |
+
"epoch": 0.08,
|
| 14920 |
+
"learning_rate": 1.0151791179631108e-05,
|
| 14921 |
+
"loss": 0.4359,
|
| 14922 |
+
"step": 1963000
|
| 14923 |
+
},
|
| 14924 |
+
{
|
| 14925 |
+
"epoch": 0.08,
|
| 14926 |
+
"learning_rate": 1.0143856216286122e-05,
|
| 14927 |
+
"loss": 0.4368,
|
| 14928 |
+
"step": 1964000
|
| 14929 |
+
},
|
| 14930 |
+
{
|
| 14931 |
+
"epoch": 0.08,
|
| 14932 |
+
"learning_rate": 1.0136134046869866e-05,
|
| 14933 |
+
"loss": 0.4357,
|
| 14934 |
+
"step": 1965000
|
| 14935 |
+
},
|
| 14936 |
+
{
|
| 14937 |
+
"epoch": 0.08,
|
| 14938 |
+
"eval_loss": 0.41740044951438904,
|
| 14939 |
+
"eval_runtime": 78.1991,
|
| 14940 |
+
"eval_samples_per_second": 81.842,
|
| 14941 |
+
"eval_steps_per_second": 0.639,
|
| 14942 |
+
"step": 1965000
|
| 14943 |
+
},
|
| 14944 |
+
{
|
| 14945 |
+
"epoch": 0.08,
|
| 14946 |
+
"learning_rate": 1.0128632097947403e-05,
|
| 14947 |
+
"loss": 0.4365,
|
| 14948 |
+
"step": 1966000
|
| 14949 |
+
},
|
| 14950 |
+
{
|
| 14951 |
+
"epoch": 0.08,
|
| 14952 |
+
"learning_rate": 1.0121335373458022e-05,
|
| 14953 |
+
"loss": 0.4362,
|
| 14954 |
+
"step": 1967000
|
| 14955 |
+
},
|
| 14956 |
+
{
|
| 14957 |
+
"epoch": 0.08,
|
| 14958 |
+
"learning_rate": 1.011425151149977e-05,
|
| 14959 |
+
"loss": 0.4361,
|
| 14960 |
+
"step": 1968000
|
| 14961 |
+
},
|
| 14962 |
+
{
|
| 14963 |
+
"epoch": 0.08,
|
| 14964 |
+
"learning_rate": 1.010738729828653e-05,
|
| 14965 |
+
"loss": 0.4375,
|
| 14966 |
+
"step": 1969000
|
| 14967 |
+
},
|
| 14968 |
+
{
|
| 14969 |
+
"epoch": 0.09,
|
| 14970 |
+
"learning_rate": 1.0100729012562797e-05,
|
| 14971 |
+
"loss": 0.4372,
|
| 14972 |
+
"step": 1970000
|
| 14973 |
+
},
|
| 14974 |
+
{
|
| 14975 |
+
"epoch": 0.09,
|
| 14976 |
+
"eval_loss": 0.4145086705684662,
|
| 14977 |
+
"eval_runtime": 79.8319,
|
| 14978 |
+
"eval_samples_per_second": 80.168,
|
| 14979 |
+
"eval_steps_per_second": 0.626,
|
| 14980 |
+
"step": 1970000
|
| 14981 |
+
},
|
| 14982 |
+
{
|
| 14983 |
+
"epoch": 0.09,
|
| 14984 |
+
"learning_rate": 1.0094289991138392e-05,
|
| 14985 |
+
"loss": 0.4363,
|
| 14986 |
+
"step": 1971000
|
| 14987 |
+
},
|
| 14988 |
+
{
|
| 14989 |
+
"epoch": 0.09,
|
| 14990 |
+
"learning_rate": 1.0088057362697175e-05,
|
| 14991 |
+
"loss": 0.4375,
|
| 14992 |
+
"step": 1972000
|
| 14993 |
+
},
|
| 14994 |
+
{
|
| 14995 |
+
"epoch": 0.09,
|
| 14996 |
+
"learning_rate": 1.0082049524936494e-05,
|
| 14997 |
+
"loss": 0.4372,
|
| 14998 |
+
"step": 1973000
|
| 14999 |
+
},
|
| 15000 |
+
{
|
| 15001 |
+
"epoch": 0.09,
|
| 15002 |
+
"learning_rate": 1.0076242416653332e-05,
|
| 15003 |
+
"loss": 0.4349,
|
| 15004 |
+
"step": 1974000
|
| 15005 |
+
},
|
| 15006 |
+
{
|
| 15007 |
+
"epoch": 0.09,
|
| 15008 |
+
"learning_rate": 1.0070648308262255e-05,
|
| 15009 |
+
"loss": 0.436,
|
| 15010 |
+
"step": 1975000
|
| 15011 |
+
},
|
| 15012 |
+
{
|
| 15013 |
+
"epoch": 0.09,
|
| 15014 |
+
"eval_loss": 0.4151042103767395,
|
| 15015 |
+
"eval_runtime": 79.0273,
|
| 15016 |
+
"eval_samples_per_second": 80.985,
|
| 15017 |
+
"eval_steps_per_second": 0.633,
|
| 15018 |
+
"step": 1975000
|
| 15019 |
+
},
|
| 15020 |
+
{
|
| 15021 |
+
"epoch": 0.09,
|
| 15022 |
+
"learning_rate": 1.006526721680391e-05,
|
| 15023 |
+
"loss": 0.4342,
|
| 15024 |
+
"step": 1976000
|
| 15025 |
+
},
|
| 15026 |
+
{
|
| 15027 |
+
"epoch": 0.09,
|
| 15028 |
+
"learning_rate": 1.0060099158670026e-05,
|
| 15029 |
+
"loss": 0.4363,
|
| 15030 |
+
"step": 1977000
|
| 15031 |
+
},
|
| 15032 |
+
{
|
| 15033 |
+
"epoch": 0.09,
|
| 15034 |
+
"learning_rate": 1.0055148998189381e-05,
|
| 15035 |
+
"loss": 0.437,
|
| 15036 |
+
"step": 1978000
|
| 15037 |
+
},
|
| 15038 |
+
{
|
| 15039 |
+
"epoch": 0.09,
|
| 15040 |
+
"learning_rate": 1.0050411475939925e-05,
|
| 15041 |
+
"loss": 0.436,
|
| 15042 |
+
"step": 1979000
|
| 15043 |
+
},
|
| 15044 |
+
{
|
| 15045 |
+
"epoch": 0.09,
|
| 15046 |
+
"learning_rate": 1.0045882183469046e-05,
|
| 15047 |
+
"loss": 0.4355,
|
| 15048 |
+
"step": 1980000
|
| 15049 |
+
},
|
| 15050 |
+
{
|
| 15051 |
+
"epoch": 0.09,
|
| 15052 |
+
"eval_loss": 0.4141569435596466,
|
| 15053 |
+
"eval_runtime": 79.5726,
|
| 15054 |
+
"eval_samples_per_second": 80.43,
|
| 15055 |
+
"eval_steps_per_second": 0.628,
|
| 15056 |
+
"step": 1980000
|
| 15057 |
+
},
|
| 15058 |
+
{
|
| 15059 |
+
"epoch": 0.09,
|
| 15060 |
+
"learning_rate": 1.0041565983372807e-05,
|
| 15061 |
+
"loss": 0.4359,
|
| 15062 |
+
"step": 1981000
|
| 15063 |
+
},
|
| 15064 |
+
{
|
| 15065 |
+
"epoch": 0.09,
|
| 15066 |
+
"learning_rate": 1.0037462888799093e-05,
|
| 15067 |
+
"loss": 0.4362,
|
| 15068 |
+
"step": 1982000
|
| 15069 |
+
},
|
| 15070 |
+
{
|
| 15071 |
+
"epoch": 0.09,
|
| 15072 |
+
"learning_rate": 1.0033576695766748e-05,
|
| 15073 |
+
"loss": 0.4376,
|
| 15074 |
+
"step": 1983000
|
| 15075 |
+
},
|
| 15076 |
+
{
|
| 15077 |
+
"epoch": 0.09,
|
| 15078 |
+
"learning_rate": 1.0029899635949539e-05,
|
| 15079 |
+
"loss": 0.4373,
|
| 15080 |
+
"step": 1984000
|
| 15081 |
+
},
|
| 15082 |
+
{
|
| 15083 |
+
"epoch": 0.09,
|
| 15084 |
+
"learning_rate": 1.0026435717192568e-05,
|
| 15085 |
+
"loss": 0.4367,
|
| 15086 |
+
"step": 1985000
|
| 15087 |
+
},
|
| 15088 |
+
{
|
| 15089 |
+
"epoch": 0.09,
|
| 15090 |
+
"eval_loss": 0.4171934127807617,
|
| 15091 |
+
"eval_runtime": 77.9474,
|
| 15092 |
+
"eval_samples_per_second": 82.107,
|
| 15093 |
+
"eval_steps_per_second": 0.641,
|
| 15094 |
+
"step": 1985000
|
| 15095 |
+
},
|
| 15096 |
+
{
|
| 15097 |
+
"epoch": 0.09,
|
| 15098 |
+
"learning_rate": 1.0023184950047551e-05,
|
| 15099 |
+
"loss": 0.4361,
|
| 15100 |
+
"step": 1986000
|
| 15101 |
+
},
|
| 15102 |
+
{
|
| 15103 |
+
"epoch": 0.09,
|
| 15104 |
+
"learning_rate": 1.002015027554519e-05,
|
| 15105 |
+
"loss": 0.4377,
|
| 15106 |
+
"step": 1987000
|
| 15107 |
+
},
|
| 15108 |
+
{
|
| 15109 |
+
"epoch": 0.09,
|
| 15110 |
+
"learning_rate": 1.0017325627506754e-05,
|
| 15111 |
+
"loss": 0.4373,
|
| 15112 |
+
"step": 1988000
|
| 15113 |
+
},
|
| 15114 |
+
{
|
| 15115 |
+
"epoch": 0.09,
|
| 15116 |
+
"learning_rate": 1.0014716663814055e-05,
|
| 15117 |
+
"loss": 0.4368,
|
| 15118 |
+
"step": 1989000
|
| 15119 |
+
},
|
| 15120 |
+
{
|
| 15121 |
+
"epoch": 0.1,
|
| 15122 |
+
"learning_rate": 1.0012320461270247e-05,
|
| 15123 |
+
"loss": 0.4358,
|
| 15124 |
+
"step": 1990000
|
| 15125 |
+
},
|
| 15126 |
+
{
|
| 15127 |
+
"epoch": 0.1,
|
| 15128 |
+
"eval_loss": 0.41612717509269714,
|
| 15129 |
+
"eval_runtime": 80.5577,
|
| 15130 |
+
"eval_samples_per_second": 79.446,
|
| 15131 |
+
"eval_steps_per_second": 0.621,
|
| 15132 |
+
"step": 1990000
|
| 15133 |
+
},
|
| 15134 |
+
{
|
| 15135 |
+
"epoch": 0.1,
|
| 15136 |
+
"learning_rate": 1.0010134948139825e-05,
|
| 15137 |
+
"loss": 0.4366,
|
| 15138 |
+
"step": 1991000
|
| 15139 |
+
},
|
| 15140 |
+
{
|
| 15141 |
+
"epoch": 0.1,
|
| 15142 |
+
"learning_rate": 1.0008162636276321e-05,
|
| 15143 |
+
"loss": 0.4369,
|
| 15144 |
+
"step": 1992000
|
| 15145 |
+
},
|
| 15146 |
+
{
|
| 15147 |
+
"epoch": 0.1,
|
| 15148 |
+
"learning_rate": 1.0006403531687724e-05,
|
| 15149 |
+
"loss": 0.4372,
|
| 15150 |
+
"step": 1993000
|
| 15151 |
+
},
|
| 15152 |
+
{
|
| 15153 |
+
"epoch": 0.1,
|
| 15154 |
+
"learning_rate": 1.0004859079123212e-05,
|
| 15155 |
+
"loss": 0.4361,
|
| 15156 |
+
"step": 1994000
|
| 15157 |
+
},
|
| 15158 |
+
{
|
| 15159 |
+
"epoch": 0.1,
|
| 15160 |
+
"learning_rate": 1.0003526191291106e-05,
|
| 15161 |
+
"loss": 0.4369,
|
| 15162 |
+
"step": 1995000
|
| 15163 |
+
},
|
| 15164 |
+
{
|
| 15165 |
+
"epoch": 0.1,
|
| 15166 |
+
"eval_loss": 0.4170204997062683,
|
| 15167 |
+
"eval_runtime": 80.1918,
|
| 15168 |
+
"eval_samples_per_second": 79.809,
|
| 15169 |
+
"eval_steps_per_second": 0.624,
|
| 15170 |
+
"step": 1995000
|
| 15171 |
+
},
|
| 15172 |
+
{
|
| 15173 |
+
"epoch": 0.1,
|
| 15174 |
+
"learning_rate": 1.0002406524857334e-05,
|
| 15175 |
+
"loss": 0.436,
|
| 15176 |
+
"step": 1996000
|
| 15177 |
+
},
|
| 15178 |
+
{
|
| 15179 |
+
"epoch": 0.1,
|
| 15180 |
+
"learning_rate": 1.0001500883167451e-05,
|
| 15181 |
+
"loss": 0.4372,
|
| 15182 |
+
"step": 1997000
|
| 15183 |
+
},
|
| 15184 |
+
{
|
| 15185 |
+
"epoch": 0.1,
|
| 15186 |
+
"learning_rate": 1.0000807455884181e-05,
|
| 15187 |
+
"loss": 0.4369,
|
| 15188 |
+
"step": 1998000
|
| 15189 |
+
},
|
| 15190 |
+
{
|
| 15191 |
+
"epoch": 0.1,
|
| 15192 |
+
"learning_rate": 1.0000327631969819e-05,
|
| 15193 |
+
"loss": 0.4362,
|
| 15194 |
+
"step": 1999000
|
| 15195 |
+
},
|
| 15196 |
+
{
|
| 15197 |
+
"epoch": 0.1,
|
| 15198 |
+
"learning_rate": 1.00000604522778e-05,
|
| 15199 |
+
"loss": 0.4363,
|
| 15200 |
+
"step": 2000000
|
| 15201 |
+
},
|
| 15202 |
+
{
|
| 15203 |
+
"epoch": 0.1,
|
| 15204 |
+
"eval_loss": 0.41442054510116577,
|
| 15205 |
+
"eval_runtime": 79.9098,
|
| 15206 |
+
"eval_samples_per_second": 80.09,
|
| 15207 |
+
"eval_steps_per_second": 0.626,
|
| 15208 |
+
"step": 2000000
|
| 15209 |
}
|
| 15210 |
],
|
| 15211 |
"max_steps": 2000000,
|
| 15212 |
"num_train_epochs": 9223372036854775807,
|
| 15213 |
+
"total_flos": 1.752506547830784e+22,
|
| 15214 |
"trial_name": null,
|
| 15215 |
"trial_params": null
|
| 15216 |
}
|
pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 449471589
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5fc35de7c7ab795f6ce22b4d822a3c81dd28eb6da159fa0e6bc70e2d249fbce8
|
| 3 |
size 449471589
|