Training in progress, epoch 20, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1227009528
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6f875ff0bfa8c3f03718200317018a9c1320ef659ed8be49eb8d1545f90dca2b
|
| 3 |
size 1227009528
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2454133690
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4edde3d63fc51cb76d2b8798e35123bee17f10b37ac6770074f24fbb1849dc32
|
| 3 |
size 2454133690
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5b0066a3b21610aa70bfcf0b5c4ca5da7f43ab12c9e601ab15813e745474a36d
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d8c5b80067b711daea816f97793263fb6b6d08534034a3999a4ce7590fa85de8
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
{
|
| 2 |
"best_metric": 34.54485321044922,
|
| 3 |
"best_model_checkpoint": "/kaggle/working/output/checkpoint-20880",
|
| 4 |
-
"epoch":
|
| 5 |
"eval_steps": 500,
|
| 6 |
-
"global_step":
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
@@ -1888,6 +1888,112 @@
|
|
| 1888 |
"eval_samples_per_second": 26.464,
|
| 1889 |
"eval_steps_per_second": 3.326,
|
| 1890 |
"step": 24795
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1891 |
}
|
| 1892 |
],
|
| 1893 |
"logging_steps": 100,
|
|
@@ -1902,7 +2008,7 @@
|
|
| 1902 |
"early_stopping_threshold": 0.0
|
| 1903 |
},
|
| 1904 |
"attributes": {
|
| 1905 |
-
"early_stopping_patience_counter":
|
| 1906 |
}
|
| 1907 |
},
|
| 1908 |
"TrainerControl": {
|
|
@@ -1916,7 +2022,7 @@
|
|
| 1916 |
"attributes": {}
|
| 1917 |
}
|
| 1918 |
},
|
| 1919 |
-
"total_flos": 2.
|
| 1920 |
"train_batch_size": 8,
|
| 1921 |
"trial_name": null,
|
| 1922 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
"best_metric": 34.54485321044922,
|
| 3 |
"best_model_checkpoint": "/kaggle/working/output/checkpoint-20880",
|
| 4 |
+
"epoch": 20.0,
|
| 5 |
"eval_steps": 500,
|
| 6 |
+
"global_step": 26100,
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
|
|
| 1888 |
"eval_samples_per_second": 26.464,
|
| 1889 |
"eval_steps_per_second": 3.326,
|
| 1890 |
"step": 24795
|
| 1891 |
+
},
|
| 1892 |
+
{
|
| 1893 |
+
"epoch": 19.00383141762452,
|
| 1894 |
+
"grad_norm": 3.04927659034729,
|
| 1895 |
+
"learning_rate": 3.8127873563218394e-05,
|
| 1896 |
+
"loss": 33.7055,
|
| 1897 |
+
"step": 24800
|
| 1898 |
+
},
|
| 1899 |
+
{
|
| 1900 |
+
"epoch": 19.080459770114942,
|
| 1901 |
+
"grad_norm": 2.725443124771118,
|
| 1902 |
+
"learning_rate": 3.8079980842911874e-05,
|
| 1903 |
+
"loss": 33.5355,
|
| 1904 |
+
"step": 24900
|
| 1905 |
+
},
|
| 1906 |
+
{
|
| 1907 |
+
"epoch": 19.157088122605366,
|
| 1908 |
+
"grad_norm": 3.853895425796509,
|
| 1909 |
+
"learning_rate": 3.803208812260536e-05,
|
| 1910 |
+
"loss": 33.5267,
|
| 1911 |
+
"step": 25000
|
| 1912 |
+
},
|
| 1913 |
+
{
|
| 1914 |
+
"epoch": 19.233716475095786,
|
| 1915 |
+
"grad_norm": 2.666419267654419,
|
| 1916 |
+
"learning_rate": 3.798419540229885e-05,
|
| 1917 |
+
"loss": 33.4069,
|
| 1918 |
+
"step": 25100
|
| 1919 |
+
},
|
| 1920 |
+
{
|
| 1921 |
+
"epoch": 19.310344827586206,
|
| 1922 |
+
"grad_norm": 3.5618317127227783,
|
| 1923 |
+
"learning_rate": 3.793630268199234e-05,
|
| 1924 |
+
"loss": 33.7295,
|
| 1925 |
+
"step": 25200
|
| 1926 |
+
},
|
| 1927 |
+
{
|
| 1928 |
+
"epoch": 19.386973180076627,
|
| 1929 |
+
"grad_norm": 3.351062297821045,
|
| 1930 |
+
"learning_rate": 3.788840996168583e-05,
|
| 1931 |
+
"loss": 33.1994,
|
| 1932 |
+
"step": 25300
|
| 1933 |
+
},
|
| 1934 |
+
{
|
| 1935 |
+
"epoch": 19.46360153256705,
|
| 1936 |
+
"grad_norm": 3.3226547241210938,
|
| 1937 |
+
"learning_rate": 3.7840996168582374e-05,
|
| 1938 |
+
"loss": 33.3149,
|
| 1939 |
+
"step": 25400
|
| 1940 |
+
},
|
| 1941 |
+
{
|
| 1942 |
+
"epoch": 19.54022988505747,
|
| 1943 |
+
"grad_norm": 4.15867805480957,
|
| 1944 |
+
"learning_rate": 3.779310344827586e-05,
|
| 1945 |
+
"loss": 33.5592,
|
| 1946 |
+
"step": 25500
|
| 1947 |
+
},
|
| 1948 |
+
{
|
| 1949 |
+
"epoch": 19.61685823754789,
|
| 1950 |
+
"grad_norm": 2.333674430847168,
|
| 1951 |
+
"learning_rate": 3.774521072796935e-05,
|
| 1952 |
+
"loss": 33.7336,
|
| 1953 |
+
"step": 25600
|
| 1954 |
+
},
|
| 1955 |
+
{
|
| 1956 |
+
"epoch": 19.693486590038315,
|
| 1957 |
+
"grad_norm": 2.9516782760620117,
|
| 1958 |
+
"learning_rate": 3.7697318007662834e-05,
|
| 1959 |
+
"loss": 33.3228,
|
| 1960 |
+
"step": 25700
|
| 1961 |
+
},
|
| 1962 |
+
{
|
| 1963 |
+
"epoch": 19.770114942528735,
|
| 1964 |
+
"grad_norm": 1.734508991241455,
|
| 1965 |
+
"learning_rate": 3.764942528735632e-05,
|
| 1966 |
+
"loss": 33.3216,
|
| 1967 |
+
"step": 25800
|
| 1968 |
+
},
|
| 1969 |
+
{
|
| 1970 |
+
"epoch": 19.846743295019156,
|
| 1971 |
+
"grad_norm": 2.4886648654937744,
|
| 1972 |
+
"learning_rate": 3.760153256704981e-05,
|
| 1973 |
+
"loss": 33.5157,
|
| 1974 |
+
"step": 25900
|
| 1975 |
+
},
|
| 1976 |
+
{
|
| 1977 |
+
"epoch": 19.92337164750958,
|
| 1978 |
+
"grad_norm": 3.6624252796173096,
|
| 1979 |
+
"learning_rate": 3.75536398467433e-05,
|
| 1980 |
+
"loss": 33.2399,
|
| 1981 |
+
"step": 26000
|
| 1982 |
+
},
|
| 1983 |
+
{
|
| 1984 |
+
"epoch": 20.0,
|
| 1985 |
+
"grad_norm": 4.810445785522461,
|
| 1986 |
+
"learning_rate": 3.750574712643679e-05,
|
| 1987 |
+
"loss": 32.548,
|
| 1988 |
+
"step": 26100
|
| 1989 |
+
},
|
| 1990 |
+
{
|
| 1991 |
+
"epoch": 20.0,
|
| 1992 |
+
"eval_loss": 34.746856689453125,
|
| 1993 |
+
"eval_runtime": 49.2861,
|
| 1994 |
+
"eval_samples_per_second": 26.478,
|
| 1995 |
+
"eval_steps_per_second": 3.328,
|
| 1996 |
+
"step": 26100
|
| 1997 |
}
|
| 1998 |
],
|
| 1999 |
"logging_steps": 100,
|
|
|
|
| 2008 |
"early_stopping_threshold": 0.0
|
| 2009 |
},
|
| 2010 |
"attributes": {
|
| 2011 |
+
"early_stopping_patience_counter": 4
|
| 2012 |
}
|
| 2013 |
},
|
| 2014 |
"TrainerControl": {
|
|
|
|
| 2022 |
"attributes": {}
|
| 2023 |
}
|
| 2024 |
},
|
| 2025 |
+
"total_flos": 2.814621806094336e+16,
|
| 2026 |
"train_batch_size": 8,
|
| 2027 |
"trial_name": null,
|
| 2028 |
"trial_params": null
|