Commit ·
9ef5a3d
1
Parent(s): ecbca89
delete optimizer state
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- checkpoint-10/optimizer.pt +0 -3
- checkpoint-10/rng_state_0.pth +0 -3
- checkpoint-10/rng_state_1.pth +0 -3
- checkpoint-10/rng_state_2.pth +0 -3
- checkpoint-10/rng_state_3.pth +0 -3
- checkpoint-10/scheduler.pt +0 -3
- checkpoint-10/trainer_state.json +0 -28
- checkpoint-100/optimizer.pt +0 -3
- checkpoint-100/rng_state_0.pth +0 -3
- checkpoint-100/rng_state_1.pth +0 -3
- checkpoint-100/rng_state_2.pth +0 -3
- checkpoint-100/rng_state_3.pth +0 -3
- checkpoint-100/scheduler.pt +0 -3
- checkpoint-100/trainer_state.json +0 -91
- checkpoint-110/optimizer.pt +0 -3
- checkpoint-110/rng_state_0.pth +0 -3
- checkpoint-110/rng_state_1.pth +0 -3
- checkpoint-110/rng_state_2.pth +0 -3
- checkpoint-110/rng_state_3.pth +0 -3
- checkpoint-110/scheduler.pt +0 -3
- checkpoint-110/trainer_state.json +0 -98
- checkpoint-120/optimizer.pt +0 -3
- checkpoint-120/rng_state_0.pth +0 -3
- checkpoint-120/rng_state_1.pth +0 -3
- checkpoint-120/rng_state_2.pth +0 -3
- checkpoint-120/rng_state_3.pth +0 -3
- checkpoint-120/scheduler.pt +0 -3
- checkpoint-120/trainer_state.json +0 -105
- checkpoint-130/optimizer.pt +0 -3
- checkpoint-130/rng_state_0.pth +0 -3
- checkpoint-130/rng_state_1.pth +0 -3
- checkpoint-130/rng_state_2.pth +0 -3
- checkpoint-130/rng_state_3.pth +0 -3
- checkpoint-130/scheduler.pt +0 -3
- checkpoint-130/trainer_state.json +0 -112
- checkpoint-140/optimizer.pt +0 -3
- checkpoint-140/rng_state_0.pth +0 -3
- checkpoint-140/rng_state_1.pth +0 -3
- checkpoint-140/rng_state_2.pth +0 -3
- checkpoint-140/rng_state_3.pth +0 -3
- checkpoint-140/scheduler.pt +0 -3
- checkpoint-140/trainer_state.json +0 -119
- checkpoint-150/optimizer.pt +0 -3
- checkpoint-150/rng_state_0.pth +0 -3
- checkpoint-150/rng_state_1.pth +0 -3
- checkpoint-150/rng_state_2.pth +0 -3
- checkpoint-150/rng_state_3.pth +0 -3
- checkpoint-150/scheduler.pt +0 -3
- checkpoint-150/trainer_state.json +0 -126
- checkpoint-160/optimizer.pt +0 -3
checkpoint-10/optimizer.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:28fa42275a7ad0b9233963bf130a7a38f00549d3464cd05287a011f7283b386e
|
| 3 |
-
size 11930938
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-10/rng_state_0.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:279c231f7db5849b53ea6f61278709c8be27bcc46fc1b36100377bf36c55cfb9
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-10/rng_state_1.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:35738ebb9e53709608b7f4feaf1edbde1a19901d813f15922153ded80ead6540
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-10/rng_state_2.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:3f411b8c60d90c0733bb03c4955ea2e40ab35464f214cb47cc4d6d0eaa83bc79
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-10/rng_state_3.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:7224ff493b87486a3e2c3001115ad539913e8fe95cf25f4bcae3236f97e83f41
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-10/scheduler.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:284a8517b2310fb02b51d6b8ddd318d3f6e139475d47fc950976ac1287debd43
|
| 3 |
-
size 1064
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-10/trainer_state.json
DELETED
|
@@ -1,28 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"best_metric": null,
|
| 3 |
-
"best_model_checkpoint": null,
|
| 4 |
-
"epoch": 0.0007256367462448298,
|
| 5 |
-
"eval_steps": 500,
|
| 6 |
-
"global_step": 10,
|
| 7 |
-
"is_hyper_param_search": false,
|
| 8 |
-
"is_local_process_zero": true,
|
| 9 |
-
"is_world_process_zero": true,
|
| 10 |
-
"log_history": [
|
| 11 |
-
{
|
| 12 |
-
"epoch": 0.0007256367462448298,
|
| 13 |
-
"grad_norm": 15.6875,
|
| 14 |
-
"learning_rate": 5e-06,
|
| 15 |
-
"loss": 3.2562,
|
| 16 |
-
"step": 10
|
| 17 |
-
}
|
| 18 |
-
],
|
| 19 |
-
"logging_steps": 10,
|
| 20 |
-
"max_steps": 13781,
|
| 21 |
-
"num_input_tokens_seen": 0,
|
| 22 |
-
"num_train_epochs": 1,
|
| 23 |
-
"save_steps": 10,
|
| 24 |
-
"total_flos": 0.0,
|
| 25 |
-
"train_batch_size": 5,
|
| 26 |
-
"trial_name": null,
|
| 27 |
-
"trial_params": null
|
| 28 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-100/optimizer.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:80a21e2292a229656009d74eddf004f8818323f0e22e5f248423b3fb4a2ce550
|
| 3 |
-
size 11930938
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-100/rng_state_0.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:85f43e42ff30186bb51f3d90dcd7d261d6e09960636961fd696f9478303d1331
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-100/rng_state_1.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:daf210db9013b20416569b6811b878570fbbf461f867de41a8a69fd07f0d2c8c
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-100/rng_state_2.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:132e0dad8f05cba3da38386b81951c801df7c5c2c1cf9e06b5d359b7b92422da
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-100/rng_state_3.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:441e2aab46e3935d5d49029fda3ebaf07053ac3a8e8a6eb7aca038ab1127bea1
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-100/scheduler.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:673ccf272a6ac969319297986ad3a484281eb54814eca9e7ffb558668525c080
|
| 3 |
-
size 1064
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-100/trainer_state.json
DELETED
|
@@ -1,91 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"best_metric": null,
|
| 3 |
-
"best_model_checkpoint": null,
|
| 4 |
-
"epoch": 0.007256367462448298,
|
| 5 |
-
"eval_steps": 500,
|
| 6 |
-
"global_step": 100,
|
| 7 |
-
"is_hyper_param_search": false,
|
| 8 |
-
"is_local_process_zero": true,
|
| 9 |
-
"is_world_process_zero": true,
|
| 10 |
-
"log_history": [
|
| 11 |
-
{
|
| 12 |
-
"epoch": 0.0007256367462448298,
|
| 13 |
-
"grad_norm": 15.6875,
|
| 14 |
-
"learning_rate": 5e-06,
|
| 15 |
-
"loss": 3.2562,
|
| 16 |
-
"step": 10
|
| 17 |
-
},
|
| 18 |
-
{
|
| 19 |
-
"epoch": 0.0014512734924896596,
|
| 20 |
-
"grad_norm": 17.5,
|
| 21 |
-
"learning_rate": 1e-05,
|
| 22 |
-
"loss": 2.9076,
|
| 23 |
-
"step": 20
|
| 24 |
-
},
|
| 25 |
-
{
|
| 26 |
-
"epoch": 0.0021769102387344894,
|
| 27 |
-
"grad_norm": 19.125,
|
| 28 |
-
"learning_rate": 1.5e-05,
|
| 29 |
-
"loss": 3.0281,
|
| 30 |
-
"step": 30
|
| 31 |
-
},
|
| 32 |
-
{
|
| 33 |
-
"epoch": 0.0029025469849793192,
|
| 34 |
-
"grad_norm": 17.875,
|
| 35 |
-
"learning_rate": 2e-05,
|
| 36 |
-
"loss": 2.6225,
|
| 37 |
-
"step": 40
|
| 38 |
-
},
|
| 39 |
-
{
|
| 40 |
-
"epoch": 0.003628183731224149,
|
| 41 |
-
"grad_norm": 13.9375,
|
| 42 |
-
"learning_rate": 2.5e-05,
|
| 43 |
-
"loss": 2.5137,
|
| 44 |
-
"step": 50
|
| 45 |
-
},
|
| 46 |
-
{
|
| 47 |
-
"epoch": 0.004353820477468979,
|
| 48 |
-
"grad_norm": 10.6875,
|
| 49 |
-
"learning_rate": 3e-05,
|
| 50 |
-
"loss": 1.994,
|
| 51 |
-
"step": 60
|
| 52 |
-
},
|
| 53 |
-
{
|
| 54 |
-
"epoch": 0.005079457223713809,
|
| 55 |
-
"grad_norm": 7.53125,
|
| 56 |
-
"learning_rate": 3.5e-05,
|
| 57 |
-
"loss": 1.5519,
|
| 58 |
-
"step": 70
|
| 59 |
-
},
|
| 60 |
-
{
|
| 61 |
-
"epoch": 0.0058050939699586385,
|
| 62 |
-
"grad_norm": 7.09375,
|
| 63 |
-
"learning_rate": 4e-05,
|
| 64 |
-
"loss": 1.2358,
|
| 65 |
-
"step": 80
|
| 66 |
-
},
|
| 67 |
-
{
|
| 68 |
-
"epoch": 0.006530730716203468,
|
| 69 |
-
"grad_norm": 8.0625,
|
| 70 |
-
"learning_rate": 4.5e-05,
|
| 71 |
-
"loss": 0.953,
|
| 72 |
-
"step": 90
|
| 73 |
-
},
|
| 74 |
-
{
|
| 75 |
-
"epoch": 0.007256367462448298,
|
| 76 |
-
"grad_norm": 6.6875,
|
| 77 |
-
"learning_rate": 5e-05,
|
| 78 |
-
"loss": 0.6256,
|
| 79 |
-
"step": 100
|
| 80 |
-
}
|
| 81 |
-
],
|
| 82 |
-
"logging_steps": 10,
|
| 83 |
-
"max_steps": 13781,
|
| 84 |
-
"num_input_tokens_seen": 0,
|
| 85 |
-
"num_train_epochs": 1,
|
| 86 |
-
"save_steps": 10,
|
| 87 |
-
"total_flos": 0.0,
|
| 88 |
-
"train_batch_size": 5,
|
| 89 |
-
"trial_name": null,
|
| 90 |
-
"trial_params": null
|
| 91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-110/optimizer.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:7c1d05c28b69b29cab4f2f112063e58e2eb30366e9fd3470b771ab44766ba229
|
| 3 |
-
size 11930938
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-110/rng_state_0.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:ac95db18ae6a1e414f19563e15335ec1a3d44d5b26a3896a591a42bf53daac57
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-110/rng_state_1.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:9ad2adf8c9d84012d5c08bc34b7d7b7bd8f571238b97deba7b563bc8579f284e
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-110/rng_state_2.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:d02fc29e95ce367f0b8273bbbf6e41186c317282c9a486968d768ffcb716f8dd
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-110/rng_state_3.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:540b9cf222feb8019c875aee3fd37ce5b892ea395b93ddd0b75459462687e321
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-110/scheduler.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:9f8534b1b0a88b0a2dee78197a1a006b8ead7b92df098f891541715a1dc34a24
|
| 3 |
-
size 1064
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-110/trainer_state.json
DELETED
|
@@ -1,98 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"best_metric": null,
|
| 3 |
-
"best_model_checkpoint": null,
|
| 4 |
-
"epoch": 0.007982004208693128,
|
| 5 |
-
"eval_steps": 500,
|
| 6 |
-
"global_step": 110,
|
| 7 |
-
"is_hyper_param_search": false,
|
| 8 |
-
"is_local_process_zero": true,
|
| 9 |
-
"is_world_process_zero": true,
|
| 10 |
-
"log_history": [
|
| 11 |
-
{
|
| 12 |
-
"epoch": 0.0007256367462448298,
|
| 13 |
-
"grad_norm": 15.6875,
|
| 14 |
-
"learning_rate": 5e-06,
|
| 15 |
-
"loss": 3.2562,
|
| 16 |
-
"step": 10
|
| 17 |
-
},
|
| 18 |
-
{
|
| 19 |
-
"epoch": 0.0014512734924896596,
|
| 20 |
-
"grad_norm": 17.5,
|
| 21 |
-
"learning_rate": 1e-05,
|
| 22 |
-
"loss": 2.9076,
|
| 23 |
-
"step": 20
|
| 24 |
-
},
|
| 25 |
-
{
|
| 26 |
-
"epoch": 0.0021769102387344894,
|
| 27 |
-
"grad_norm": 19.125,
|
| 28 |
-
"learning_rate": 1.5e-05,
|
| 29 |
-
"loss": 3.0281,
|
| 30 |
-
"step": 30
|
| 31 |
-
},
|
| 32 |
-
{
|
| 33 |
-
"epoch": 0.0029025469849793192,
|
| 34 |
-
"grad_norm": 17.875,
|
| 35 |
-
"learning_rate": 2e-05,
|
| 36 |
-
"loss": 2.6225,
|
| 37 |
-
"step": 40
|
| 38 |
-
},
|
| 39 |
-
{
|
| 40 |
-
"epoch": 0.003628183731224149,
|
| 41 |
-
"grad_norm": 13.9375,
|
| 42 |
-
"learning_rate": 2.5e-05,
|
| 43 |
-
"loss": 2.5137,
|
| 44 |
-
"step": 50
|
| 45 |
-
},
|
| 46 |
-
{
|
| 47 |
-
"epoch": 0.004353820477468979,
|
| 48 |
-
"grad_norm": 10.6875,
|
| 49 |
-
"learning_rate": 3e-05,
|
| 50 |
-
"loss": 1.994,
|
| 51 |
-
"step": 60
|
| 52 |
-
},
|
| 53 |
-
{
|
| 54 |
-
"epoch": 0.005079457223713809,
|
| 55 |
-
"grad_norm": 7.53125,
|
| 56 |
-
"learning_rate": 3.5e-05,
|
| 57 |
-
"loss": 1.5519,
|
| 58 |
-
"step": 70
|
| 59 |
-
},
|
| 60 |
-
{
|
| 61 |
-
"epoch": 0.0058050939699586385,
|
| 62 |
-
"grad_norm": 7.09375,
|
| 63 |
-
"learning_rate": 4e-05,
|
| 64 |
-
"loss": 1.2358,
|
| 65 |
-
"step": 80
|
| 66 |
-
},
|
| 67 |
-
{
|
| 68 |
-
"epoch": 0.006530730716203468,
|
| 69 |
-
"grad_norm": 8.0625,
|
| 70 |
-
"learning_rate": 4.5e-05,
|
| 71 |
-
"loss": 0.953,
|
| 72 |
-
"step": 90
|
| 73 |
-
},
|
| 74 |
-
{
|
| 75 |
-
"epoch": 0.007256367462448298,
|
| 76 |
-
"grad_norm": 6.6875,
|
| 77 |
-
"learning_rate": 5e-05,
|
| 78 |
-
"loss": 0.6256,
|
| 79 |
-
"step": 100
|
| 80 |
-
},
|
| 81 |
-
{
|
| 82 |
-
"epoch": 0.007982004208693128,
|
| 83 |
-
"grad_norm": 5.03125,
|
| 84 |
-
"learning_rate": 4.9999934086574596e-05,
|
| 85 |
-
"loss": 0.6601,
|
| 86 |
-
"step": 110
|
| 87 |
-
}
|
| 88 |
-
],
|
| 89 |
-
"logging_steps": 10,
|
| 90 |
-
"max_steps": 13781,
|
| 91 |
-
"num_input_tokens_seen": 0,
|
| 92 |
-
"num_train_epochs": 1,
|
| 93 |
-
"save_steps": 10,
|
| 94 |
-
"total_flos": 0.0,
|
| 95 |
-
"train_batch_size": 5,
|
| 96 |
-
"trial_name": null,
|
| 97 |
-
"trial_params": null
|
| 98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-120/optimizer.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:0b09d6d7d21066366d44b942d7a2b0b49d79b9fb3b2e17af43c09a19b626ed07
|
| 3 |
-
size 11930938
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-120/rng_state_0.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e010d5c5e5ff459e09cee093e035058bce80bd0e562b9008cf49e37a37c4a265
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-120/rng_state_1.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:8ceaec8c84867fda1405ca685c206ff5498d51b755970edb435f4777d1649c24
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-120/rng_state_2.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:791eebd4302125380e3da4e87668a4bb1db8af54a2e9f9519cb225a5eefb78b6
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-120/rng_state_3.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:7b5e8577ac55da749e71c292571c66ba7068eaeeac8f69a2d9ecb004c4ea24df
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-120/scheduler.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:ce3f2037fd0e6b38f795755403c77706eafd4be540aa363267475ac986bd25f5
|
| 3 |
-
size 1064
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-120/trainer_state.json
DELETED
|
@@ -1,105 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"best_metric": null,
|
| 3 |
-
"best_model_checkpoint": null,
|
| 4 |
-
"epoch": 0.008707640954937958,
|
| 5 |
-
"eval_steps": 500,
|
| 6 |
-
"global_step": 120,
|
| 7 |
-
"is_hyper_param_search": false,
|
| 8 |
-
"is_local_process_zero": true,
|
| 9 |
-
"is_world_process_zero": true,
|
| 10 |
-
"log_history": [
|
| 11 |
-
{
|
| 12 |
-
"epoch": 0.0007256367462448298,
|
| 13 |
-
"grad_norm": 15.6875,
|
| 14 |
-
"learning_rate": 5e-06,
|
| 15 |
-
"loss": 3.2562,
|
| 16 |
-
"step": 10
|
| 17 |
-
},
|
| 18 |
-
{
|
| 19 |
-
"epoch": 0.0014512734924896596,
|
| 20 |
-
"grad_norm": 17.5,
|
| 21 |
-
"learning_rate": 1e-05,
|
| 22 |
-
"loss": 2.9076,
|
| 23 |
-
"step": 20
|
| 24 |
-
},
|
| 25 |
-
{
|
| 26 |
-
"epoch": 0.0021769102387344894,
|
| 27 |
-
"grad_norm": 19.125,
|
| 28 |
-
"learning_rate": 1.5e-05,
|
| 29 |
-
"loss": 3.0281,
|
| 30 |
-
"step": 30
|
| 31 |
-
},
|
| 32 |
-
{
|
| 33 |
-
"epoch": 0.0029025469849793192,
|
| 34 |
-
"grad_norm": 17.875,
|
| 35 |
-
"learning_rate": 2e-05,
|
| 36 |
-
"loss": 2.6225,
|
| 37 |
-
"step": 40
|
| 38 |
-
},
|
| 39 |
-
{
|
| 40 |
-
"epoch": 0.003628183731224149,
|
| 41 |
-
"grad_norm": 13.9375,
|
| 42 |
-
"learning_rate": 2.5e-05,
|
| 43 |
-
"loss": 2.5137,
|
| 44 |
-
"step": 50
|
| 45 |
-
},
|
| 46 |
-
{
|
| 47 |
-
"epoch": 0.004353820477468979,
|
| 48 |
-
"grad_norm": 10.6875,
|
| 49 |
-
"learning_rate": 3e-05,
|
| 50 |
-
"loss": 1.994,
|
| 51 |
-
"step": 60
|
| 52 |
-
},
|
| 53 |
-
{
|
| 54 |
-
"epoch": 0.005079457223713809,
|
| 55 |
-
"grad_norm": 7.53125,
|
| 56 |
-
"learning_rate": 3.5e-05,
|
| 57 |
-
"loss": 1.5519,
|
| 58 |
-
"step": 70
|
| 59 |
-
},
|
| 60 |
-
{
|
| 61 |
-
"epoch": 0.0058050939699586385,
|
| 62 |
-
"grad_norm": 7.09375,
|
| 63 |
-
"learning_rate": 4e-05,
|
| 64 |
-
"loss": 1.2358,
|
| 65 |
-
"step": 80
|
| 66 |
-
},
|
| 67 |
-
{
|
| 68 |
-
"epoch": 0.006530730716203468,
|
| 69 |
-
"grad_norm": 8.0625,
|
| 70 |
-
"learning_rate": 4.5e-05,
|
| 71 |
-
"loss": 0.953,
|
| 72 |
-
"step": 90
|
| 73 |
-
},
|
| 74 |
-
{
|
| 75 |
-
"epoch": 0.007256367462448298,
|
| 76 |
-
"grad_norm": 6.6875,
|
| 77 |
-
"learning_rate": 5e-05,
|
| 78 |
-
"loss": 0.6256,
|
| 79 |
-
"step": 100
|
| 80 |
-
},
|
| 81 |
-
{
|
| 82 |
-
"epoch": 0.007982004208693128,
|
| 83 |
-
"grad_norm": 5.03125,
|
| 84 |
-
"learning_rate": 4.9999934086574596e-05,
|
| 85 |
-
"loss": 0.6601,
|
| 86 |
-
"step": 110
|
| 87 |
-
},
|
| 88 |
-
{
|
| 89 |
-
"epoch": 0.008707640954937958,
|
| 90 |
-
"grad_norm": 5.59375,
|
| 91 |
-
"learning_rate": 4.9999736346645943e-05,
|
| 92 |
-
"loss": 0.659,
|
| 93 |
-
"step": 120
|
| 94 |
-
}
|
| 95 |
-
],
|
| 96 |
-
"logging_steps": 10,
|
| 97 |
-
"max_steps": 13781,
|
| 98 |
-
"num_input_tokens_seen": 0,
|
| 99 |
-
"num_train_epochs": 1,
|
| 100 |
-
"save_steps": 10,
|
| 101 |
-
"total_flos": 0.0,
|
| 102 |
-
"train_batch_size": 5,
|
| 103 |
-
"trial_name": null,
|
| 104 |
-
"trial_params": null
|
| 105 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-130/optimizer.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:30edf8ef192e630551b6f042d64674afb872062ad1f496520768b24e1d3fdb05
|
| 3 |
-
size 11930938
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-130/rng_state_0.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:7ced0c6ba13f477a0dbd44034592fe000f226e71898cbab5bf87ce59dc6bde36
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-130/rng_state_1.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e97d793be909b79220b59b211d87fda9d35184d2305c00641e9b4531b73b8441
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-130/rng_state_2.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:60416c656b12aaecd01e32e964532f371c0a6b02a4b9b91ccfdc35d45dce0050
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-130/rng_state_3.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:218ddffb5978f25094e6ad3cfbfc85ad7b807a183e3bc9f6f15bd471542d7273
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-130/scheduler.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:c1cfc563a99d275c44661506e6df813c32ae4b04cafb6adaaa4449e91127f43d
|
| 3 |
-
size 1064
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-130/trainer_state.json
DELETED
|
@@ -1,112 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"best_metric": null,
|
| 3 |
-
"best_model_checkpoint": null,
|
| 4 |
-
"epoch": 0.009433277701182788,
|
| 5 |
-
"eval_steps": 500,
|
| 6 |
-
"global_step": 130,
|
| 7 |
-
"is_hyper_param_search": false,
|
| 8 |
-
"is_local_process_zero": true,
|
| 9 |
-
"is_world_process_zero": true,
|
| 10 |
-
"log_history": [
|
| 11 |
-
{
|
| 12 |
-
"epoch": 0.0007256367462448298,
|
| 13 |
-
"grad_norm": 15.6875,
|
| 14 |
-
"learning_rate": 5e-06,
|
| 15 |
-
"loss": 3.2562,
|
| 16 |
-
"step": 10
|
| 17 |
-
},
|
| 18 |
-
{
|
| 19 |
-
"epoch": 0.0014512734924896596,
|
| 20 |
-
"grad_norm": 17.5,
|
| 21 |
-
"learning_rate": 1e-05,
|
| 22 |
-
"loss": 2.9076,
|
| 23 |
-
"step": 20
|
| 24 |
-
},
|
| 25 |
-
{
|
| 26 |
-
"epoch": 0.0021769102387344894,
|
| 27 |
-
"grad_norm": 19.125,
|
| 28 |
-
"learning_rate": 1.5e-05,
|
| 29 |
-
"loss": 3.0281,
|
| 30 |
-
"step": 30
|
| 31 |
-
},
|
| 32 |
-
{
|
| 33 |
-
"epoch": 0.0029025469849793192,
|
| 34 |
-
"grad_norm": 17.875,
|
| 35 |
-
"learning_rate": 2e-05,
|
| 36 |
-
"loss": 2.6225,
|
| 37 |
-
"step": 40
|
| 38 |
-
},
|
| 39 |
-
{
|
| 40 |
-
"epoch": 0.003628183731224149,
|
| 41 |
-
"grad_norm": 13.9375,
|
| 42 |
-
"learning_rate": 2.5e-05,
|
| 43 |
-
"loss": 2.5137,
|
| 44 |
-
"step": 50
|
| 45 |
-
},
|
| 46 |
-
{
|
| 47 |
-
"epoch": 0.004353820477468979,
|
| 48 |
-
"grad_norm": 10.6875,
|
| 49 |
-
"learning_rate": 3e-05,
|
| 50 |
-
"loss": 1.994,
|
| 51 |
-
"step": 60
|
| 52 |
-
},
|
| 53 |
-
{
|
| 54 |
-
"epoch": 0.005079457223713809,
|
| 55 |
-
"grad_norm": 7.53125,
|
| 56 |
-
"learning_rate": 3.5e-05,
|
| 57 |
-
"loss": 1.5519,
|
| 58 |
-
"step": 70
|
| 59 |
-
},
|
| 60 |
-
{
|
| 61 |
-
"epoch": 0.0058050939699586385,
|
| 62 |
-
"grad_norm": 7.09375,
|
| 63 |
-
"learning_rate": 4e-05,
|
| 64 |
-
"loss": 1.2358,
|
| 65 |
-
"step": 80
|
| 66 |
-
},
|
| 67 |
-
{
|
| 68 |
-
"epoch": 0.006530730716203468,
|
| 69 |
-
"grad_norm": 8.0625,
|
| 70 |
-
"learning_rate": 4.5e-05,
|
| 71 |
-
"loss": 0.953,
|
| 72 |
-
"step": 90
|
| 73 |
-
},
|
| 74 |
-
{
|
| 75 |
-
"epoch": 0.007256367462448298,
|
| 76 |
-
"grad_norm": 6.6875,
|
| 77 |
-
"learning_rate": 5e-05,
|
| 78 |
-
"loss": 0.6256,
|
| 79 |
-
"step": 100
|
| 80 |
-
},
|
| 81 |
-
{
|
| 82 |
-
"epoch": 0.007982004208693128,
|
| 83 |
-
"grad_norm": 5.03125,
|
| 84 |
-
"learning_rate": 4.9999934086574596e-05,
|
| 85 |
-
"loss": 0.6601,
|
| 86 |
-
"step": 110
|
| 87 |
-
},
|
| 88 |
-
{
|
| 89 |
-
"epoch": 0.008707640954937958,
|
| 90 |
-
"grad_norm": 5.59375,
|
| 91 |
-
"learning_rate": 4.9999736346645943e-05,
|
| 92 |
-
"loss": 0.659,
|
| 93 |
-
"step": 120
|
| 94 |
-
},
|
| 95 |
-
{
|
| 96 |
-
"epoch": 0.009433277701182788,
|
| 97 |
-
"grad_norm": 3.953125,
|
| 98 |
-
"learning_rate": 4.999940678125673e-05,
|
| 99 |
-
"loss": 0.512,
|
| 100 |
-
"step": 130
|
| 101 |
-
}
|
| 102 |
-
],
|
| 103 |
-
"logging_steps": 10,
|
| 104 |
-
"max_steps": 13781,
|
| 105 |
-
"num_input_tokens_seen": 0,
|
| 106 |
-
"num_train_epochs": 1,
|
| 107 |
-
"save_steps": 10,
|
| 108 |
-
"total_flos": 0.0,
|
| 109 |
-
"train_batch_size": 5,
|
| 110 |
-
"trial_name": null,
|
| 111 |
-
"trial_params": null
|
| 112 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-140/optimizer.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:be96e1e75721e995205be7310e523fdedd96da16498e62cade920fc940ae2b32
|
| 3 |
-
size 11930938
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-140/rng_state_0.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:6ad67dbef2a21b26f3117ca45d621957bf72b1116535cf6e524b17661b94b1a9
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-140/rng_state_1.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:ca3afd4f067268e4c6ff34242266c9e70bce106dd4d7365781bb893119a4033d
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-140/rng_state_2.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:f7a56e9bc058e763d68d477e80d923c2fe559a75d518ac8d5d693397a88304b3
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-140/rng_state_3.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:8aef494a54b19f2a6c92fb251d8acadbfc7c21bcba926f5a7f5fa134981bb678
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-140/scheduler.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:3f37f75b563805241f01236805ecd93e76fa35840939411337a4cd1f0771215b
|
| 3 |
-
size 1064
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-140/trainer_state.json
DELETED
|
@@ -1,119 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"best_metric": null,
|
| 3 |
-
"best_model_checkpoint": null,
|
| 4 |
-
"epoch": 0.010158914447427617,
|
| 5 |
-
"eval_steps": 500,
|
| 6 |
-
"global_step": 140,
|
| 7 |
-
"is_hyper_param_search": false,
|
| 8 |
-
"is_local_process_zero": true,
|
| 9 |
-
"is_world_process_zero": true,
|
| 10 |
-
"log_history": [
|
| 11 |
-
{
|
| 12 |
-
"epoch": 0.0007256367462448298,
|
| 13 |
-
"grad_norm": 15.6875,
|
| 14 |
-
"learning_rate": 5e-06,
|
| 15 |
-
"loss": 3.2562,
|
| 16 |
-
"step": 10
|
| 17 |
-
},
|
| 18 |
-
{
|
| 19 |
-
"epoch": 0.0014512734924896596,
|
| 20 |
-
"grad_norm": 17.5,
|
| 21 |
-
"learning_rate": 1e-05,
|
| 22 |
-
"loss": 2.9076,
|
| 23 |
-
"step": 20
|
| 24 |
-
},
|
| 25 |
-
{
|
| 26 |
-
"epoch": 0.0021769102387344894,
|
| 27 |
-
"grad_norm": 19.125,
|
| 28 |
-
"learning_rate": 1.5e-05,
|
| 29 |
-
"loss": 3.0281,
|
| 30 |
-
"step": 30
|
| 31 |
-
},
|
| 32 |
-
{
|
| 33 |
-
"epoch": 0.0029025469849793192,
|
| 34 |
-
"grad_norm": 17.875,
|
| 35 |
-
"learning_rate": 2e-05,
|
| 36 |
-
"loss": 2.6225,
|
| 37 |
-
"step": 40
|
| 38 |
-
},
|
| 39 |
-
{
|
| 40 |
-
"epoch": 0.003628183731224149,
|
| 41 |
-
"grad_norm": 13.9375,
|
| 42 |
-
"learning_rate": 2.5e-05,
|
| 43 |
-
"loss": 2.5137,
|
| 44 |
-
"step": 50
|
| 45 |
-
},
|
| 46 |
-
{
|
| 47 |
-
"epoch": 0.004353820477468979,
|
| 48 |
-
"grad_norm": 10.6875,
|
| 49 |
-
"learning_rate": 3e-05,
|
| 50 |
-
"loss": 1.994,
|
| 51 |
-
"step": 60
|
| 52 |
-
},
|
| 53 |
-
{
|
| 54 |
-
"epoch": 0.005079457223713809,
|
| 55 |
-
"grad_norm": 7.53125,
|
| 56 |
-
"learning_rate": 3.5e-05,
|
| 57 |
-
"loss": 1.5519,
|
| 58 |
-
"step": 70
|
| 59 |
-
},
|
| 60 |
-
{
|
| 61 |
-
"epoch": 0.0058050939699586385,
|
| 62 |
-
"grad_norm": 7.09375,
|
| 63 |
-
"learning_rate": 4e-05,
|
| 64 |
-
"loss": 1.2358,
|
| 65 |
-
"step": 80
|
| 66 |
-
},
|
| 67 |
-
{
|
| 68 |
-
"epoch": 0.006530730716203468,
|
| 69 |
-
"grad_norm": 8.0625,
|
| 70 |
-
"learning_rate": 4.5e-05,
|
| 71 |
-
"loss": 0.953,
|
| 72 |
-
"step": 90
|
| 73 |
-
},
|
| 74 |
-
{
|
| 75 |
-
"epoch": 0.007256367462448298,
|
| 76 |
-
"grad_norm": 6.6875,
|
| 77 |
-
"learning_rate": 5e-05,
|
| 78 |
-
"loss": 0.6256,
|
| 79 |
-
"step": 100
|
| 80 |
-
},
|
| 81 |
-
{
|
| 82 |
-
"epoch": 0.007982004208693128,
|
| 83 |
-
"grad_norm": 5.03125,
|
| 84 |
-
"learning_rate": 4.9999934086574596e-05,
|
| 85 |
-
"loss": 0.6601,
|
| 86 |
-
"step": 110
|
| 87 |
-
},
|
| 88 |
-
{
|
| 89 |
-
"epoch": 0.008707640954937958,
|
| 90 |
-
"grad_norm": 5.59375,
|
| 91 |
-
"learning_rate": 4.9999736346645943e-05,
|
| 92 |
-
"loss": 0.659,
|
| 93 |
-
"step": 120
|
| 94 |
-
},
|
| 95 |
-
{
|
| 96 |
-
"epoch": 0.009433277701182788,
|
| 97 |
-
"grad_norm": 3.953125,
|
| 98 |
-
"learning_rate": 4.999940678125673e-05,
|
| 99 |
-
"loss": 0.512,
|
| 100 |
-
"step": 130
|
| 101 |
-
},
|
| 102 |
-
{
|
| 103 |
-
"epoch": 0.010158914447427617,
|
| 104 |
-
"grad_norm": 5.46875,
|
| 105 |
-
"learning_rate": 4.9998945392144796e-05,
|
| 106 |
-
"loss": 0.5522,
|
| 107 |
-
"step": 140
|
| 108 |
-
}
|
| 109 |
-
],
|
| 110 |
-
"logging_steps": 10,
|
| 111 |
-
"max_steps": 13781,
|
| 112 |
-
"num_input_tokens_seen": 0,
|
| 113 |
-
"num_train_epochs": 1,
|
| 114 |
-
"save_steps": 10,
|
| 115 |
-
"total_flos": 0.0,
|
| 116 |
-
"train_batch_size": 5,
|
| 117 |
-
"trial_name": null,
|
| 118 |
-
"trial_params": null
|
| 119 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-150/optimizer.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:d9f864f1bca3aefbbc513b4a23725dbafb84a08442854a0d3825a3020cd44e50
|
| 3 |
-
size 11930938
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-150/rng_state_0.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:c342af2ce35811f7314d04dcf27fe047ef7a2c2c65a53827cf5bfa3bbef9abbb
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-150/rng_state_1.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:732cda9265841aab840d0742ab54e81d4890cc436da4ad72a7491a2de6e456cd
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-150/rng_state_2.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:b9e93aaf91c3d45dc0a00b2862a0b23147bc87200884e67202507624081ba206
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-150/rng_state_3.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:bde38e9c7ebd4dcc6310f8e51cdb47e2f01b8ae902f2ef5613c6f4a36b2b5231
|
| 3 |
-
size 15024
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-150/scheduler.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:dfbaeb5b70ff8c056924a39650a51a7bdb8be3dcfb3678304690b37e1bbf9e25
|
| 3 |
-
size 1064
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-150/trainer_state.json
DELETED
|
@@ -1,126 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"best_metric": null,
|
| 3 |
-
"best_model_checkpoint": null,
|
| 4 |
-
"epoch": 0.010884551193672447,
|
| 5 |
-
"eval_steps": 500,
|
| 6 |
-
"global_step": 150,
|
| 7 |
-
"is_hyper_param_search": false,
|
| 8 |
-
"is_local_process_zero": true,
|
| 9 |
-
"is_world_process_zero": true,
|
| 10 |
-
"log_history": [
|
| 11 |
-
{
|
| 12 |
-
"epoch": 0.0007256367462448298,
|
| 13 |
-
"grad_norm": 15.6875,
|
| 14 |
-
"learning_rate": 5e-06,
|
| 15 |
-
"loss": 3.2562,
|
| 16 |
-
"step": 10
|
| 17 |
-
},
|
| 18 |
-
{
|
| 19 |
-
"epoch": 0.0014512734924896596,
|
| 20 |
-
"grad_norm": 17.5,
|
| 21 |
-
"learning_rate": 1e-05,
|
| 22 |
-
"loss": 2.9076,
|
| 23 |
-
"step": 20
|
| 24 |
-
},
|
| 25 |
-
{
|
| 26 |
-
"epoch": 0.0021769102387344894,
|
| 27 |
-
"grad_norm": 19.125,
|
| 28 |
-
"learning_rate": 1.5e-05,
|
| 29 |
-
"loss": 3.0281,
|
| 30 |
-
"step": 30
|
| 31 |
-
},
|
| 32 |
-
{
|
| 33 |
-
"epoch": 0.0029025469849793192,
|
| 34 |
-
"grad_norm": 17.875,
|
| 35 |
-
"learning_rate": 2e-05,
|
| 36 |
-
"loss": 2.6225,
|
| 37 |
-
"step": 40
|
| 38 |
-
},
|
| 39 |
-
{
|
| 40 |
-
"epoch": 0.003628183731224149,
|
| 41 |
-
"grad_norm": 13.9375,
|
| 42 |
-
"learning_rate": 2.5e-05,
|
| 43 |
-
"loss": 2.5137,
|
| 44 |
-
"step": 50
|
| 45 |
-
},
|
| 46 |
-
{
|
| 47 |
-
"epoch": 0.004353820477468979,
|
| 48 |
-
"grad_norm": 10.6875,
|
| 49 |
-
"learning_rate": 3e-05,
|
| 50 |
-
"loss": 1.994,
|
| 51 |
-
"step": 60
|
| 52 |
-
},
|
| 53 |
-
{
|
| 54 |
-
"epoch": 0.005079457223713809,
|
| 55 |
-
"grad_norm": 7.53125,
|
| 56 |
-
"learning_rate": 3.5e-05,
|
| 57 |
-
"loss": 1.5519,
|
| 58 |
-
"step": 70
|
| 59 |
-
},
|
| 60 |
-
{
|
| 61 |
-
"epoch": 0.0058050939699586385,
|
| 62 |
-
"grad_norm": 7.09375,
|
| 63 |
-
"learning_rate": 4e-05,
|
| 64 |
-
"loss": 1.2358,
|
| 65 |
-
"step": 80
|
| 66 |
-
},
|
| 67 |
-
{
|
| 68 |
-
"epoch": 0.006530730716203468,
|
| 69 |
-
"grad_norm": 8.0625,
|
| 70 |
-
"learning_rate": 4.5e-05,
|
| 71 |
-
"loss": 0.953,
|
| 72 |
-
"step": 90
|
| 73 |
-
},
|
| 74 |
-
{
|
| 75 |
-
"epoch": 0.007256367462448298,
|
| 76 |
-
"grad_norm": 6.6875,
|
| 77 |
-
"learning_rate": 5e-05,
|
| 78 |
-
"loss": 0.6256,
|
| 79 |
-
"step": 100
|
| 80 |
-
},
|
| 81 |
-
{
|
| 82 |
-
"epoch": 0.007982004208693128,
|
| 83 |
-
"grad_norm": 5.03125,
|
| 84 |
-
"learning_rate": 4.9999934086574596e-05,
|
| 85 |
-
"loss": 0.6601,
|
| 86 |
-
"step": 110
|
| 87 |
-
},
|
| 88 |
-
{
|
| 89 |
-
"epoch": 0.008707640954937958,
|
| 90 |
-
"grad_norm": 5.59375,
|
| 91 |
-
"learning_rate": 4.9999736346645943e-05,
|
| 92 |
-
"loss": 0.659,
|
| 93 |
-
"step": 120
|
| 94 |
-
},
|
| 95 |
-
{
|
| 96 |
-
"epoch": 0.009433277701182788,
|
| 97 |
-
"grad_norm": 3.953125,
|
| 98 |
-
"learning_rate": 4.999940678125673e-05,
|
| 99 |
-
"loss": 0.512,
|
| 100 |
-
"step": 130
|
| 101 |
-
},
|
| 102 |
-
{
|
| 103 |
-
"epoch": 0.010158914447427617,
|
| 104 |
-
"grad_norm": 5.46875,
|
| 105 |
-
"learning_rate": 4.9998945392144796e-05,
|
| 106 |
-
"loss": 0.5522,
|
| 107 |
-
"step": 140
|
| 108 |
-
},
|
| 109 |
-
{
|
| 110 |
-
"epoch": 0.010884551193672447,
|
| 111 |
-
"grad_norm": 3.96875,
|
| 112 |
-
"learning_rate": 4.999835218174307e-05,
|
| 113 |
-
"loss": 0.5532,
|
| 114 |
-
"step": 150
|
| 115 |
-
}
|
| 116 |
-
],
|
| 117 |
-
"logging_steps": 10,
|
| 118 |
-
"max_steps": 13781,
|
| 119 |
-
"num_input_tokens_seen": 0,
|
| 120 |
-
"num_train_epochs": 1,
|
| 121 |
-
"save_steps": 10,
|
| 122 |
-
"total_flos": 0.0,
|
| 123 |
-
"train_batch_size": 5,
|
| 124 |
-
"trial_name": null,
|
| 125 |
-
"trial_params": null
|
| 126 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
checkpoint-160/optimizer.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:d47bf63b4d37493b7b937961025ca5642fb56ea1b0c57c6b7181d051550eb98e
|
| 3 |
-
size 11930938
|
|
|
|
|
|
|
|
|
|
|
|