Training in progress, step 13000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 891558696
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:89d970941693f2a588e9760f579c7f92a3993862857386f4ce7a42732003bed9
|
| 3 |
size 891558696
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1783272762
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fa728c323613f14720350a34b8de9e9bb0f00feb6895e9763f98b41fc90ba66a
|
| 3 |
size 1783272762
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6f6fa31d853fe83023de7f7f07d4ad55cd60c82617211a8926ae6bb50464d9fc
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8a8f1b9bc1d96d7439df35e8166ab30771f48f3a8a26970884d1d49063118f39
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
{
|
| 2 |
"best_metric": 0.0824647843837738,
|
| 3 |
"best_model_checkpoint": "./fine-tuned/checkpoint-12500",
|
| 4 |
-
"epoch": 1.
|
| 5 |
"eval_steps": 500,
|
| 6 |
-
"global_step":
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
@@ -1957,6 +1957,84 @@
|
|
| 1957 |
"eval_samples_per_second": 22.714,
|
| 1958 |
"eval_steps_per_second": 5.678,
|
| 1959 |
"step": 12500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1960 |
}
|
| 1961 |
],
|
| 1962 |
"logging_steps": 50,
|
|
@@ -1976,7 +2054,7 @@
|
|
| 1976 |
"attributes": {}
|
| 1977 |
}
|
| 1978 |
},
|
| 1979 |
-
"total_flos": 3.
|
| 1980 |
"train_batch_size": 4,
|
| 1981 |
"trial_name": null,
|
| 1982 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
"best_metric": 0.0824647843837738,
|
| 3 |
"best_model_checkpoint": "./fine-tuned/checkpoint-12500",
|
| 4 |
+
"epoch": 1.04,
|
| 5 |
"eval_steps": 500,
|
| 6 |
+
"global_step": 13000,
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
|
|
| 1957 |
"eval_samples_per_second": 22.714,
|
| 1958 |
"eval_steps_per_second": 5.678,
|
| 1959 |
"step": 12500
|
| 1960 |
+
},
|
| 1961 |
+
{
|
| 1962 |
+
"epoch": 1.004,
|
| 1963 |
+
"grad_norm": 0.12758083641529083,
|
| 1964 |
+
"learning_rate": 1.4942400000000001e-05,
|
| 1965 |
+
"loss": 0.0553,
|
| 1966 |
+
"step": 12550
|
| 1967 |
+
},
|
| 1968 |
+
{
|
| 1969 |
+
"epoch": 1.008,
|
| 1970 |
+
"grad_norm": 0.14093191921710968,
|
| 1971 |
+
"learning_rate": 1.4882400000000002e-05,
|
| 1972 |
+
"loss": 0.0572,
|
| 1973 |
+
"step": 12600
|
| 1974 |
+
},
|
| 1975 |
+
{
|
| 1976 |
+
"epoch": 1.012,
|
| 1977 |
+
"grad_norm": 0.20115911960601807,
|
| 1978 |
+
"learning_rate": 1.48224e-05,
|
| 1979 |
+
"loss": 0.055,
|
| 1980 |
+
"step": 12650
|
| 1981 |
+
},
|
| 1982 |
+
{
|
| 1983 |
+
"epoch": 1.016,
|
| 1984 |
+
"grad_norm": 0.2974820137023926,
|
| 1985 |
+
"learning_rate": 1.4762400000000001e-05,
|
| 1986 |
+
"loss": 0.0548,
|
| 1987 |
+
"step": 12700
|
| 1988 |
+
},
|
| 1989 |
+
{
|
| 1990 |
+
"epoch": 1.02,
|
| 1991 |
+
"grad_norm": 0.06170056387782097,
|
| 1992 |
+
"learning_rate": 1.47024e-05,
|
| 1993 |
+
"loss": 0.0528,
|
| 1994 |
+
"step": 12750
|
| 1995 |
+
},
|
| 1996 |
+
{
|
| 1997 |
+
"epoch": 1.024,
|
| 1998 |
+
"grad_norm": 0.18193961679935455,
|
| 1999 |
+
"learning_rate": 1.46424e-05,
|
| 2000 |
+
"loss": 0.0551,
|
| 2001 |
+
"step": 12800
|
| 2002 |
+
},
|
| 2003 |
+
{
|
| 2004 |
+
"epoch": 1.028,
|
| 2005 |
+
"grad_norm": 0.11086717993021011,
|
| 2006 |
+
"learning_rate": 1.4582400000000001e-05,
|
| 2007 |
+
"loss": 0.0533,
|
| 2008 |
+
"step": 12850
|
| 2009 |
+
},
|
| 2010 |
+
{
|
| 2011 |
+
"epoch": 1.032,
|
| 2012 |
+
"grad_norm": 0.1452319473028183,
|
| 2013 |
+
"learning_rate": 1.45224e-05,
|
| 2014 |
+
"loss": 0.0525,
|
| 2015 |
+
"step": 12900
|
| 2016 |
+
},
|
| 2017 |
+
{
|
| 2018 |
+
"epoch": 1.036,
|
| 2019 |
+
"grad_norm": 0.16397640109062195,
|
| 2020 |
+
"learning_rate": 1.44624e-05,
|
| 2021 |
+
"loss": 0.0622,
|
| 2022 |
+
"step": 12950
|
| 2023 |
+
},
|
| 2024 |
+
{
|
| 2025 |
+
"epoch": 1.04,
|
| 2026 |
+
"grad_norm": 0.09709367156028748,
|
| 2027 |
+
"learning_rate": 1.4402400000000001e-05,
|
| 2028 |
+
"loss": 0.0559,
|
| 2029 |
+
"step": 13000
|
| 2030 |
+
},
|
| 2031 |
+
{
|
| 2032 |
+
"epoch": 1.04,
|
| 2033 |
+
"eval_loss": 0.08302706480026245,
|
| 2034 |
+
"eval_runtime": 88.0122,
|
| 2035 |
+
"eval_samples_per_second": 22.724,
|
| 2036 |
+
"eval_steps_per_second": 5.681,
|
| 2037 |
+
"step": 13000
|
| 2038 |
}
|
| 2039 |
],
|
| 2040 |
"logging_steps": 50,
|
|
|
|
| 2054 |
"attributes": {}
|
| 2055 |
}
|
| 2056 |
},
|
| 2057 |
+
"total_flos": 3.166581030912e+16,
|
| 2058 |
"train_batch_size": 4,
|
| 2059 |
"trial_name": null,
|
| 2060 |
"trial_params": null
|