Upload folder using huggingface_hub
Browse files- adapter_model.safetensors +2 -2
- checkpoint-100/adapter_model.safetensors +2 -2
- checkpoint-100/optimizer.pt +1 -1
- checkpoint-100/rng_state.pth +1 -1
- checkpoint-100/scheduler.pt +1 -1
- checkpoint-100/trainer_state.json +61 -61
- checkpoint-100/training_args.bin +1 -1
- checkpoint-20/adapter_model.safetensors +2 -2
- checkpoint-20/optimizer.pt +1 -1
- checkpoint-20/rng_state.pth +1 -1
- checkpoint-20/scheduler.pt +1 -1
- checkpoint-20/trainer_state.json +13 -13
- checkpoint-20/training_args.bin +1 -1
- checkpoint-40/adapter_model.safetensors +2 -2
- checkpoint-40/optimizer.pt +1 -1
- checkpoint-40/rng_state.pth +1 -1
- checkpoint-40/scheduler.pt +1 -1
- checkpoint-40/trainer_state.json +25 -25
- checkpoint-40/training_args.bin +1 -1
- checkpoint-60/adapter_model.safetensors +2 -2
- checkpoint-60/optimizer.pt +1 -1
- checkpoint-60/rng_state.pth +1 -1
- checkpoint-60/scheduler.pt +1 -1
- checkpoint-60/trainer_state.json +37 -37
- checkpoint-60/training_args.bin +1 -1
- checkpoint-80/adapter_model.safetensors +2 -2
- checkpoint-80/optimizer.pt +1 -1
- checkpoint-80/rng_state.pth +1 -1
- checkpoint-80/scheduler.pt +1 -1
- checkpoint-80/trainer_state.json +49 -49
- checkpoint-80/training_args.bin +1 -1
- training_args.bin +1 -1
adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e39308711dbd127efe41850085dc7f2f63d54a44aad2dd2a502891474924b787
|
| 3 |
+
size 83946192
|
checkpoint-100/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e39308711dbd127efe41850085dc7f2f63d54a44aad2dd2a502891474924b787
|
| 3 |
+
size 83946192
|
checkpoint-100/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 335810482
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dacc62b0f0a2216d58e92701202dd5456a079db701e9e321345271c623dd7c6a
|
| 3 |
size 335810482
|
checkpoint-100/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14168
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2975b104fc6f24da571473b9b64f8d64d931a14d44726583f5951b3fe5be12b9
|
| 3 |
size 14168
|
checkpoint-100/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dddbc5bcee87f33f86ec113accc5c003fca74582bc423aa05d433c41c6d3cf4e
|
| 3 |
size 1056
|
checkpoint-100/trainer_state.json
CHANGED
|
@@ -10,142 +10,142 @@
|
|
| 10 |
"log_history": [
|
| 11 |
{
|
| 12 |
"epoch": 0.2,
|
| 13 |
-
"grad_norm": 5.
|
| 14 |
"learning_rate": 2.9968542393565676e-06,
|
| 15 |
-
"loss": 1.
|
| 16 |
"step": 5
|
| 17 |
},
|
| 18 |
{
|
| 19 |
"epoch": 0.41,
|
| 20 |
-
"grad_norm":
|
| 21 |
-
"learning_rate": 2.
|
| 22 |
-
"loss": 1.
|
| 23 |
"step": 10
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"epoch": 0.61,
|
| 27 |
-
"grad_norm": 3.
|
| 28 |
-
"learning_rate": 2.
|
| 29 |
-
"loss": 1.
|
| 30 |
"step": 15
|
| 31 |
},
|
| 32 |
{
|
| 33 |
"epoch": 0.82,
|
| 34 |
-
"grad_norm": 3.
|
| 35 |
-
"learning_rate": 2.
|
| 36 |
-
"loss": 1.
|
| 37 |
"step": 20
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"epoch": 1.02,
|
| 41 |
-
"grad_norm": 4.
|
| 42 |
-
"learning_rate": 2.
|
| 43 |
-
"loss": 1.
|
| 44 |
"step": 25
|
| 45 |
},
|
| 46 |
{
|
| 47 |
"epoch": 1.22,
|
| 48 |
-
"grad_norm": 3.
|
| 49 |
-
"learning_rate": 2.
|
| 50 |
-
"loss": 1.
|
| 51 |
"step": 30
|
| 52 |
},
|
| 53 |
{
|
| 54 |
"epoch": 1.43,
|
| 55 |
-
"grad_norm": 4.
|
| 56 |
-
"learning_rate": 2.
|
| 57 |
-
"loss": 1.
|
| 58 |
"step": 35
|
| 59 |
},
|
| 60 |
{
|
| 61 |
"epoch": 1.63,
|
| 62 |
-
"grad_norm": 3.
|
| 63 |
-
"learning_rate": 2.
|
| 64 |
-
"loss": 1.
|
| 65 |
"step": 40
|
| 66 |
},
|
| 67 |
{
|
| 68 |
"epoch": 1.84,
|
| 69 |
-
"grad_norm": 3.
|
| 70 |
-
"learning_rate": 1.
|
| 71 |
-
"loss": 1.
|
| 72 |
"step": 45
|
| 73 |
},
|
| 74 |
{
|
| 75 |
"epoch": 2.04,
|
| 76 |
-
"grad_norm": 3.
|
| 77 |
-
"learning_rate": 1.
|
| 78 |
-
"loss": 1.
|
| 79 |
"step": 50
|
| 80 |
},
|
| 81 |
{
|
| 82 |
"epoch": 2.24,
|
| 83 |
-
"grad_norm": 3.
|
| 84 |
-
"learning_rate": 1.
|
| 85 |
-
"loss": 1.
|
| 86 |
"step": 55
|
| 87 |
},
|
| 88 |
{
|
| 89 |
"epoch": 2.45,
|
| 90 |
-
"grad_norm": 3.
|
| 91 |
-
"learning_rate": 1.
|
| 92 |
-
"loss": 1.
|
| 93 |
"step": 60
|
| 94 |
},
|
| 95 |
{
|
| 96 |
"epoch": 2.65,
|
| 97 |
-
"grad_norm":
|
| 98 |
-
"learning_rate":
|
| 99 |
-
"loss": 1.
|
| 100 |
"step": 65
|
| 101 |
},
|
| 102 |
{
|
| 103 |
"epoch": 2.86,
|
| 104 |
-
"grad_norm": 3.
|
| 105 |
-
"learning_rate": 6.
|
| 106 |
-
"loss": 1.
|
| 107 |
"step": 70
|
| 108 |
},
|
| 109 |
{
|
| 110 |
"epoch": 3.06,
|
| 111 |
-
"grad_norm":
|
| 112 |
-
"learning_rate":
|
| 113 |
-
"loss": 1.
|
| 114 |
"step": 75
|
| 115 |
},
|
| 116 |
{
|
| 117 |
"epoch": 3.27,
|
| 118 |
-
"grad_norm": 3.
|
| 119 |
-
"learning_rate": 3.
|
| 120 |
-
"loss": 1.
|
| 121 |
"step": 80
|
| 122 |
},
|
| 123 |
{
|
| 124 |
"epoch": 3.47,
|
| 125 |
-
"grad_norm": 3.
|
| 126 |
-
"learning_rate": 1.
|
| 127 |
-
"loss": 1.
|
| 128 |
"step": 85
|
| 129 |
},
|
| 130 |
{
|
| 131 |
"epoch": 3.67,
|
| 132 |
-
"grad_norm": 3.
|
| 133 |
-
"learning_rate":
|
| 134 |
-
"loss": 1.
|
| 135 |
"step": 90
|
| 136 |
},
|
| 137 |
{
|
| 138 |
"epoch": 3.88,
|
| 139 |
-
"grad_norm": 3.
|
| 140 |
-
"learning_rate":
|
| 141 |
-
"loss": 1.
|
| 142 |
"step": 95
|
| 143 |
},
|
| 144 |
{
|
| 145 |
"epoch": 4.08,
|
| 146 |
-
"grad_norm": 3.
|
| 147 |
-
"learning_rate":
|
| 148 |
-
"loss": 1.
|
| 149 |
"step": 100
|
| 150 |
}
|
| 151 |
],
|
|
@@ -154,8 +154,8 @@
|
|
| 154 |
"num_input_tokens_seen": 0,
|
| 155 |
"num_train_epochs": 5,
|
| 156 |
"save_steps": 20,
|
| 157 |
-
"total_flos":
|
| 158 |
-
"train_batch_size":
|
| 159 |
"trial_name": null,
|
| 160 |
"trial_params": null
|
| 161 |
}
|
|
|
|
| 10 |
"log_history": [
|
| 11 |
{
|
| 12 |
"epoch": 0.2,
|
| 13 |
+
"grad_norm": 5.09375,
|
| 14 |
"learning_rate": 2.9968542393565676e-06,
|
| 15 |
+
"loss": 1.9388,
|
| 16 |
"step": 5
|
| 17 |
},
|
| 18 |
{
|
| 19 |
"epoch": 0.41,
|
| 20 |
+
"grad_norm": 4.4375,
|
| 21 |
+
"learning_rate": 2.9616157869703894e-06,
|
| 22 |
+
"loss": 1.8455,
|
| 23 |
"step": 10
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"epoch": 0.61,
|
| 27 |
+
"grad_norm": 3.59375,
|
| 28 |
+
"learning_rate": 2.8881318444640566e-06,
|
| 29 |
+
"loss": 1.7796,
|
| 30 |
"step": 15
|
| 31 |
},
|
| 32 |
{
|
| 33 |
"epoch": 0.82,
|
| 34 |
+
"grad_norm": 3.796875,
|
| 35 |
+
"learning_rate": 2.778325235483954e-06,
|
| 36 |
+
"loss": 1.8091,
|
| 37 |
"step": 20
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"epoch": 1.02,
|
| 41 |
+
"grad_norm": 4.625,
|
| 42 |
+
"learning_rate": 2.6350692237265428e-06,
|
| 43 |
+
"loss": 1.7224,
|
| 44 |
"step": 25
|
| 45 |
},
|
| 46 |
{
|
| 47 |
"epoch": 1.22,
|
| 48 |
+
"grad_norm": 3.953125,
|
| 49 |
+
"learning_rate": 2.4621123294467098e-06,
|
| 50 |
+
"loss": 1.7108,
|
| 51 |
"step": 30
|
| 52 |
},
|
| 53 |
{
|
| 54 |
"epoch": 1.43,
|
| 55 |
+
"grad_norm": 4.125,
|
| 56 |
+
"learning_rate": 2.2639802434931445e-06,
|
| 57 |
+
"loss": 1.7299,
|
| 58 |
"step": 35
|
| 59 |
},
|
| 60 |
{
|
| 61 |
"epoch": 1.63,
|
| 62 |
+
"grad_norm": 3.625,
|
| 63 |
+
"learning_rate": 2.0458574054452316e-06,
|
| 64 |
+
"loss": 1.7111,
|
| 65 |
"step": 40
|
| 66 |
},
|
| 67 |
{
|
| 68 |
"epoch": 1.84,
|
| 69 |
+
"grad_norm": 3.53125,
|
| 70 |
+
"learning_rate": 1.813451344546913e-06,
|
| 71 |
+
"loss": 1.7364,
|
| 72 |
"step": 45
|
| 73 |
},
|
| 74 |
{
|
| 75 |
"epoch": 2.04,
|
| 76 |
+
"grad_norm": 3.1875,
|
| 77 |
+
"learning_rate": 1.5728433331716726e-06,
|
| 78 |
+
"loss": 1.6664,
|
| 79 |
"step": 50
|
| 80 |
},
|
| 81 |
{
|
| 82 |
"epoch": 2.24,
|
| 83 |
+
"grad_norm": 3.125,
|
| 84 |
+
"learning_rate": 1.3303292607070737e-06,
|
| 85 |
+
"loss": 1.6673,
|
| 86 |
"step": 55
|
| 87 |
},
|
| 88 |
{
|
| 89 |
"epoch": 2.45,
|
| 90 |
+
"grad_norm": 3.65625,
|
| 91 |
+
"learning_rate": 1.0922548916454855e-06,
|
| 92 |
+
"loss": 1.6219,
|
| 93 |
"step": 60
|
| 94 |
},
|
| 95 |
{
|
| 96 |
"epoch": 2.65,
|
| 97 |
+
"grad_norm": 2.8125,
|
| 98 |
+
"learning_rate": 8.648498186137653e-07,
|
| 99 |
+
"loss": 1.6648,
|
| 100 |
"step": 65
|
| 101 |
},
|
| 102 |
{
|
| 103 |
"epoch": 2.86,
|
| 104 |
+
"grad_norm": 3.53125,
|
| 105 |
+
"learning_rate": 6.540644552236401e-07,
|
| 106 |
+
"loss": 1.699,
|
| 107 |
"step": 70
|
| 108 |
},
|
| 109 |
{
|
| 110 |
"epoch": 3.06,
|
| 111 |
+
"grad_norm": 4.15625,
|
| 112 |
+
"learning_rate": 4.6541433408284356e-07,
|
| 113 |
+
"loss": 1.6821,
|
| 114 |
"step": 75
|
| 115 |
},
|
| 116 |
{
|
| 117 |
"epoch": 3.27,
|
| 118 |
+
"grad_norm": 3.546875,
|
| 119 |
+
"learning_rate": 3.0383578415591913e-07,
|
| 120 |
+
"loss": 1.6633,
|
| 121 |
"step": 80
|
| 122 |
},
|
| 123 |
{
|
| 124 |
"epoch": 3.47,
|
| 125 |
+
"grad_norm": 3.4375,
|
| 126 |
+
"learning_rate": 1.7355676390496482e-07,
|
| 127 |
+
"loss": 1.6522,
|
| 128 |
"step": 85
|
| 129 |
},
|
| 130 |
{
|
| 131 |
"epoch": 3.67,
|
| 132 |
+
"grad_norm": 3.328125,
|
| 133 |
+
"learning_rate": 7.798623006559436e-08,
|
| 134 |
+
"loss": 1.6788,
|
| 135 |
"step": 90
|
| 136 |
},
|
| 137 |
{
|
| 138 |
"epoch": 3.88,
|
| 139 |
+
"grad_norm": 3.828125,
|
| 140 |
+
"learning_rate": 1.962493689916395e-08,
|
| 141 |
+
"loss": 1.6806,
|
| 142 |
"step": 95
|
| 143 |
},
|
| 144 |
{
|
| 145 |
"epoch": 4.08,
|
| 146 |
+
"grad_norm": 3.25,
|
| 147 |
+
"learning_rate": 0.0,
|
| 148 |
+
"loss": 1.6213,
|
| 149 |
"step": 100
|
| 150 |
}
|
| 151 |
],
|
|
|
|
| 154 |
"num_input_tokens_seen": 0,
|
| 155 |
"num_train_epochs": 5,
|
| 156 |
"save_steps": 20,
|
| 157 |
+
"total_flos": 8016985030459392.0,
|
| 158 |
+
"train_batch_size": 4,
|
| 159 |
"trial_name": null,
|
| 160 |
"trial_params": null
|
| 161 |
}
|
checkpoint-100/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23cfefdd62756fac4437632539fdfbb741029e6fb943cafeffe397c21a403a5d
|
| 3 |
size 4960
|
checkpoint-20/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:36fe3540199ef008a5a14901c68ed9f51bd9b4d479e2c02c0a2a74bf9ab9c08d
|
| 3 |
+
size 83946192
|
checkpoint-20/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 335810482
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ba79b0978d0a6742535ddd213b0cf9c7209903a3c55809669cb1c9d19ccf9abc
|
| 3 |
size 335810482
|
checkpoint-20/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14168
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:596bb0c43f17f8cd0971123a502b06f192d7a434146d9d5e3e84fb081424cc46
|
| 3 |
size 14168
|
checkpoint-20/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2256134ffb225a6c790a5cebe9b44be1002bae7b80db44f96adfc6030072a13c
|
| 3 |
size 1056
|
checkpoint-20/trainer_state.json
CHANGED
|
@@ -10,30 +10,30 @@
|
|
| 10 |
"log_history": [
|
| 11 |
{
|
| 12 |
"epoch": 0.2,
|
| 13 |
-
"grad_norm": 5.
|
| 14 |
"learning_rate": 2.9968542393565676e-06,
|
| 15 |
-
"loss": 1.
|
| 16 |
"step": 5
|
| 17 |
},
|
| 18 |
{
|
| 19 |
"epoch": 0.41,
|
| 20 |
-
"grad_norm":
|
| 21 |
-
"learning_rate": 2.
|
| 22 |
-
"loss": 1.
|
| 23 |
"step": 10
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"epoch": 0.61,
|
| 27 |
-
"grad_norm": 3.
|
| 28 |
-
"learning_rate": 2.
|
| 29 |
-
"loss": 1.
|
| 30 |
"step": 15
|
| 31 |
},
|
| 32 |
{
|
| 33 |
"epoch": 0.82,
|
| 34 |
-
"grad_norm": 3.
|
| 35 |
-
"learning_rate": 2.
|
| 36 |
-
"loss": 1.
|
| 37 |
"step": 20
|
| 38 |
}
|
| 39 |
],
|
|
@@ -42,8 +42,8 @@
|
|
| 42 |
"num_input_tokens_seen": 0,
|
| 43 |
"num_train_epochs": 5,
|
| 44 |
"save_steps": 20,
|
| 45 |
-
"total_flos":
|
| 46 |
-
"train_batch_size":
|
| 47 |
"trial_name": null,
|
| 48 |
"trial_params": null
|
| 49 |
}
|
|
|
|
| 10 |
"log_history": [
|
| 11 |
{
|
| 12 |
"epoch": 0.2,
|
| 13 |
+
"grad_norm": 5.09375,
|
| 14 |
"learning_rate": 2.9968542393565676e-06,
|
| 15 |
+
"loss": 1.9388,
|
| 16 |
"step": 5
|
| 17 |
},
|
| 18 |
{
|
| 19 |
"epoch": 0.41,
|
| 20 |
+
"grad_norm": 4.4375,
|
| 21 |
+
"learning_rate": 2.9616157869703894e-06,
|
| 22 |
+
"loss": 1.8455,
|
| 23 |
"step": 10
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"epoch": 0.61,
|
| 27 |
+
"grad_norm": 3.59375,
|
| 28 |
+
"learning_rate": 2.8881318444640566e-06,
|
| 29 |
+
"loss": 1.7796,
|
| 30 |
"step": 15
|
| 31 |
},
|
| 32 |
{
|
| 33 |
"epoch": 0.82,
|
| 34 |
+
"grad_norm": 3.796875,
|
| 35 |
+
"learning_rate": 2.778325235483954e-06,
|
| 36 |
+
"loss": 1.8091,
|
| 37 |
"step": 20
|
| 38 |
}
|
| 39 |
],
|
|
|
|
| 42 |
"num_input_tokens_seen": 0,
|
| 43 |
"num_train_epochs": 5,
|
| 44 |
"save_steps": 20,
|
| 45 |
+
"total_flos": 1604332476923904.0,
|
| 46 |
+
"train_batch_size": 4,
|
| 47 |
"trial_name": null,
|
| 48 |
"trial_params": null
|
| 49 |
}
|
checkpoint-20/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23cfefdd62756fac4437632539fdfbb741029e6fb943cafeffe397c21a403a5d
|
| 3 |
size 4960
|
checkpoint-40/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ab1d62447cec6baff4b897987488e96f3e155ae7a2d789c9bfd44df3352413bd
|
| 3 |
+
size 83946192
|
checkpoint-40/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 335810482
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bb9a12580e8a534dab4a9273a9183c0f3caabe7de1e89240f077919754dc8398
|
| 3 |
size 335810482
|
checkpoint-40/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14168
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:076780903adeff97a16d6f25588ee658c45903a68edf79adba5d800f18428061
|
| 3 |
size 14168
|
checkpoint-40/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aaf7ad3bbb37486a5e7658ddf05bfee6df67222659155f89c9b8f68d5f791717
|
| 3 |
size 1056
|
checkpoint-40/trainer_state.json
CHANGED
|
@@ -10,58 +10,58 @@
|
|
| 10 |
"log_history": [
|
| 11 |
{
|
| 12 |
"epoch": 0.2,
|
| 13 |
-
"grad_norm": 5.
|
| 14 |
"learning_rate": 2.9968542393565676e-06,
|
| 15 |
-
"loss": 1.
|
| 16 |
"step": 5
|
| 17 |
},
|
| 18 |
{
|
| 19 |
"epoch": 0.41,
|
| 20 |
-
"grad_norm":
|
| 21 |
-
"learning_rate": 2.
|
| 22 |
-
"loss": 1.
|
| 23 |
"step": 10
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"epoch": 0.61,
|
| 27 |
-
"grad_norm": 3.
|
| 28 |
-
"learning_rate": 2.
|
| 29 |
-
"loss": 1.
|
| 30 |
"step": 15
|
| 31 |
},
|
| 32 |
{
|
| 33 |
"epoch": 0.82,
|
| 34 |
-
"grad_norm": 3.
|
| 35 |
-
"learning_rate": 2.
|
| 36 |
-
"loss": 1.
|
| 37 |
"step": 20
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"epoch": 1.02,
|
| 41 |
-
"grad_norm": 4.
|
| 42 |
-
"learning_rate": 2.
|
| 43 |
-
"loss": 1.
|
| 44 |
"step": 25
|
| 45 |
},
|
| 46 |
{
|
| 47 |
"epoch": 1.22,
|
| 48 |
-
"grad_norm": 3.
|
| 49 |
-
"learning_rate": 2.
|
| 50 |
-
"loss": 1.
|
| 51 |
"step": 30
|
| 52 |
},
|
| 53 |
{
|
| 54 |
"epoch": 1.43,
|
| 55 |
-
"grad_norm": 4.
|
| 56 |
-
"learning_rate": 2.
|
| 57 |
-
"loss": 1.
|
| 58 |
"step": 35
|
| 59 |
},
|
| 60 |
{
|
| 61 |
"epoch": 1.63,
|
| 62 |
-
"grad_norm": 3.
|
| 63 |
-
"learning_rate": 2.
|
| 64 |
-
"loss": 1.
|
| 65 |
"step": 40
|
| 66 |
}
|
| 67 |
],
|
|
@@ -70,8 +70,8 @@
|
|
| 70 |
"num_input_tokens_seen": 0,
|
| 71 |
"num_train_epochs": 5,
|
| 72 |
"save_steps": 20,
|
| 73 |
-
"total_flos":
|
| 74 |
-
"train_batch_size":
|
| 75 |
"trial_name": null,
|
| 76 |
"trial_params": null
|
| 77 |
}
|
|
|
|
| 10 |
"log_history": [
|
| 11 |
{
|
| 12 |
"epoch": 0.2,
|
| 13 |
+
"grad_norm": 5.09375,
|
| 14 |
"learning_rate": 2.9968542393565676e-06,
|
| 15 |
+
"loss": 1.9388,
|
| 16 |
"step": 5
|
| 17 |
},
|
| 18 |
{
|
| 19 |
"epoch": 0.41,
|
| 20 |
+
"grad_norm": 4.4375,
|
| 21 |
+
"learning_rate": 2.9616157869703894e-06,
|
| 22 |
+
"loss": 1.8455,
|
| 23 |
"step": 10
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"epoch": 0.61,
|
| 27 |
+
"grad_norm": 3.59375,
|
| 28 |
+
"learning_rate": 2.8881318444640566e-06,
|
| 29 |
+
"loss": 1.7796,
|
| 30 |
"step": 15
|
| 31 |
},
|
| 32 |
{
|
| 33 |
"epoch": 0.82,
|
| 34 |
+
"grad_norm": 3.796875,
|
| 35 |
+
"learning_rate": 2.778325235483954e-06,
|
| 36 |
+
"loss": 1.8091,
|
| 37 |
"step": 20
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"epoch": 1.02,
|
| 41 |
+
"grad_norm": 4.625,
|
| 42 |
+
"learning_rate": 2.6350692237265428e-06,
|
| 43 |
+
"loss": 1.7224,
|
| 44 |
"step": 25
|
| 45 |
},
|
| 46 |
{
|
| 47 |
"epoch": 1.22,
|
| 48 |
+
"grad_norm": 3.953125,
|
| 49 |
+
"learning_rate": 2.4621123294467098e-06,
|
| 50 |
+
"loss": 1.7108,
|
| 51 |
"step": 30
|
| 52 |
},
|
| 53 |
{
|
| 54 |
"epoch": 1.43,
|
| 55 |
+
"grad_norm": 4.125,
|
| 56 |
+
"learning_rate": 2.2639802434931445e-06,
|
| 57 |
+
"loss": 1.7299,
|
| 58 |
"step": 35
|
| 59 |
},
|
| 60 |
{
|
| 61 |
"epoch": 1.63,
|
| 62 |
+
"grad_norm": 3.625,
|
| 63 |
+
"learning_rate": 2.0458574054452316e-06,
|
| 64 |
+
"loss": 1.7111,
|
| 65 |
"step": 40
|
| 66 |
}
|
| 67 |
],
|
|
|
|
| 70 |
"num_input_tokens_seen": 0,
|
| 71 |
"num_train_epochs": 5,
|
| 72 |
"save_steps": 20,
|
| 73 |
+
"total_flos": 3199050392518656.0,
|
| 74 |
+
"train_batch_size": 4,
|
| 75 |
"trial_name": null,
|
| 76 |
"trial_params": null
|
| 77 |
}
|
checkpoint-40/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23cfefdd62756fac4437632539fdfbb741029e6fb943cafeffe397c21a403a5d
|
| 3 |
size 4960
|
checkpoint-60/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d0b479837a2f06b820e0f0f448720fe1bd18ae1237586386c794257aec6a6aef
|
| 3 |
+
size 83946192
|
checkpoint-60/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 335810482
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bc5a078967486289c7bb2846375d3eb6741292205506fe5dde1c1a54fd203bf7
|
| 3 |
size 335810482
|
checkpoint-60/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14168
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6fe60fa4f0a2e1e83d88e7fff8399bc59bb2f8ab03ac19802ae736a7dbddb571
|
| 3 |
size 14168
|
checkpoint-60/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d151571d18043fd71e8a555e30e88db47c2b573382d14ae55c5df3ad38604822
|
| 3 |
size 1056
|
checkpoint-60/trainer_state.json
CHANGED
|
@@ -10,86 +10,86 @@
|
|
| 10 |
"log_history": [
|
| 11 |
{
|
| 12 |
"epoch": 0.2,
|
| 13 |
-
"grad_norm": 5.
|
| 14 |
"learning_rate": 2.9968542393565676e-06,
|
| 15 |
-
"loss": 1.
|
| 16 |
"step": 5
|
| 17 |
},
|
| 18 |
{
|
| 19 |
"epoch": 0.41,
|
| 20 |
-
"grad_norm":
|
| 21 |
-
"learning_rate": 2.
|
| 22 |
-
"loss": 1.
|
| 23 |
"step": 10
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"epoch": 0.61,
|
| 27 |
-
"grad_norm": 3.
|
| 28 |
-
"learning_rate": 2.
|
| 29 |
-
"loss": 1.
|
| 30 |
"step": 15
|
| 31 |
},
|
| 32 |
{
|
| 33 |
"epoch": 0.82,
|
| 34 |
-
"grad_norm": 3.
|
| 35 |
-
"learning_rate": 2.
|
| 36 |
-
"loss": 1.
|
| 37 |
"step": 20
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"epoch": 1.02,
|
| 41 |
-
"grad_norm": 4.
|
| 42 |
-
"learning_rate": 2.
|
| 43 |
-
"loss": 1.
|
| 44 |
"step": 25
|
| 45 |
},
|
| 46 |
{
|
| 47 |
"epoch": 1.22,
|
| 48 |
-
"grad_norm": 3.
|
| 49 |
-
"learning_rate": 2.
|
| 50 |
-
"loss": 1.
|
| 51 |
"step": 30
|
| 52 |
},
|
| 53 |
{
|
| 54 |
"epoch": 1.43,
|
| 55 |
-
"grad_norm": 4.
|
| 56 |
-
"learning_rate": 2.
|
| 57 |
-
"loss": 1.
|
| 58 |
"step": 35
|
| 59 |
},
|
| 60 |
{
|
| 61 |
"epoch": 1.63,
|
| 62 |
-
"grad_norm": 3.
|
| 63 |
-
"learning_rate": 2.
|
| 64 |
-
"loss": 1.
|
| 65 |
"step": 40
|
| 66 |
},
|
| 67 |
{
|
| 68 |
"epoch": 1.84,
|
| 69 |
-
"grad_norm": 3.
|
| 70 |
-
"learning_rate": 1.
|
| 71 |
-
"loss": 1.
|
| 72 |
"step": 45
|
| 73 |
},
|
| 74 |
{
|
| 75 |
"epoch": 2.04,
|
| 76 |
-
"grad_norm": 3.
|
| 77 |
-
"learning_rate": 1.
|
| 78 |
-
"loss": 1.
|
| 79 |
"step": 50
|
| 80 |
},
|
| 81 |
{
|
| 82 |
"epoch": 2.24,
|
| 83 |
-
"grad_norm": 3.
|
| 84 |
-
"learning_rate": 1.
|
| 85 |
-
"loss": 1.
|
| 86 |
"step": 55
|
| 87 |
},
|
| 88 |
{
|
| 89 |
"epoch": 2.45,
|
| 90 |
-
"grad_norm": 3.
|
| 91 |
-
"learning_rate": 1.
|
| 92 |
-
"loss": 1.
|
| 93 |
"step": 60
|
| 94 |
}
|
| 95 |
],
|
|
@@ -98,8 +98,8 @@
|
|
| 98 |
"num_input_tokens_seen": 0,
|
| 99 |
"num_train_epochs": 5,
|
| 100 |
"save_steps": 20,
|
| 101 |
-
"total_flos":
|
| 102 |
-
"train_batch_size":
|
| 103 |
"trial_name": null,
|
| 104 |
"trial_params": null
|
| 105 |
}
|
|
|
|
| 10 |
"log_history": [
|
| 11 |
{
|
| 12 |
"epoch": 0.2,
|
| 13 |
+
"grad_norm": 5.09375,
|
| 14 |
"learning_rate": 2.9968542393565676e-06,
|
| 15 |
+
"loss": 1.9388,
|
| 16 |
"step": 5
|
| 17 |
},
|
| 18 |
{
|
| 19 |
"epoch": 0.41,
|
| 20 |
+
"grad_norm": 4.4375,
|
| 21 |
+
"learning_rate": 2.9616157869703894e-06,
|
| 22 |
+
"loss": 1.8455,
|
| 23 |
"step": 10
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"epoch": 0.61,
|
| 27 |
+
"grad_norm": 3.59375,
|
| 28 |
+
"learning_rate": 2.8881318444640566e-06,
|
| 29 |
+
"loss": 1.7796,
|
| 30 |
"step": 15
|
| 31 |
},
|
| 32 |
{
|
| 33 |
"epoch": 0.82,
|
| 34 |
+
"grad_norm": 3.796875,
|
| 35 |
+
"learning_rate": 2.778325235483954e-06,
|
| 36 |
+
"loss": 1.8091,
|
| 37 |
"step": 20
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"epoch": 1.02,
|
| 41 |
+
"grad_norm": 4.625,
|
| 42 |
+
"learning_rate": 2.6350692237265428e-06,
|
| 43 |
+
"loss": 1.7224,
|
| 44 |
"step": 25
|
| 45 |
},
|
| 46 |
{
|
| 47 |
"epoch": 1.22,
|
| 48 |
+
"grad_norm": 3.953125,
|
| 49 |
+
"learning_rate": 2.4621123294467098e-06,
|
| 50 |
+
"loss": 1.7108,
|
| 51 |
"step": 30
|
| 52 |
},
|
| 53 |
{
|
| 54 |
"epoch": 1.43,
|
| 55 |
+
"grad_norm": 4.125,
|
| 56 |
+
"learning_rate": 2.2639802434931445e-06,
|
| 57 |
+
"loss": 1.7299,
|
| 58 |
"step": 35
|
| 59 |
},
|
| 60 |
{
|
| 61 |
"epoch": 1.63,
|
| 62 |
+
"grad_norm": 3.625,
|
| 63 |
+
"learning_rate": 2.0458574054452316e-06,
|
| 64 |
+
"loss": 1.7111,
|
| 65 |
"step": 40
|
| 66 |
},
|
| 67 |
{
|
| 68 |
"epoch": 1.84,
|
| 69 |
+
"grad_norm": 3.53125,
|
| 70 |
+
"learning_rate": 1.813451344546913e-06,
|
| 71 |
+
"loss": 1.7364,
|
| 72 |
"step": 45
|
| 73 |
},
|
| 74 |
{
|
| 75 |
"epoch": 2.04,
|
| 76 |
+
"grad_norm": 3.1875,
|
| 77 |
+
"learning_rate": 1.5728433331716726e-06,
|
| 78 |
+
"loss": 1.6664,
|
| 79 |
"step": 50
|
| 80 |
},
|
| 81 |
{
|
| 82 |
"epoch": 2.24,
|
| 83 |
+
"grad_norm": 3.125,
|
| 84 |
+
"learning_rate": 1.3303292607070737e-06,
|
| 85 |
+
"loss": 1.6673,
|
| 86 |
"step": 55
|
| 87 |
},
|
| 88 |
{
|
| 89 |
"epoch": 2.45,
|
| 90 |
+
"grad_norm": 3.65625,
|
| 91 |
+
"learning_rate": 1.0922548916454855e-06,
|
| 92 |
+
"loss": 1.6219,
|
| 93 |
"step": 60
|
| 94 |
}
|
| 95 |
],
|
|
|
|
| 98 |
"num_input_tokens_seen": 0,
|
| 99 |
"num_train_epochs": 5,
|
| 100 |
"save_steps": 20,
|
| 101 |
+
"total_flos": 4847990969303040.0,
|
| 102 |
+
"train_batch_size": 4,
|
| 103 |
"trial_name": null,
|
| 104 |
"trial_params": null
|
| 105 |
}
|
checkpoint-60/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23cfefdd62756fac4437632539fdfbb741029e6fb943cafeffe397c21a403a5d
|
| 3 |
size 4960
|
checkpoint-80/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d2072d0773bb7a3b3ac9c1b62ccf0be9fb811367ea38d87012e0fe0cd0fa4c99
|
| 3 |
+
size 83946192
|
checkpoint-80/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 335810482
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6a3436433c51ee5552f4562b6706190e1b0b15ef3f5a6d76cc552fa0d29e0c6f
|
| 3 |
size 335810482
|
checkpoint-80/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14168
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:39b06804188097f3c1bab9da775567b42104978432eb2ab2e415e5e56cb71c34
|
| 3 |
size 14168
|
checkpoint-80/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f9a8053fc960722f2783f69c7ab9c51a9ad21ea164a693d17b94cded06192073
|
| 3 |
size 1056
|
checkpoint-80/trainer_state.json
CHANGED
|
@@ -10,114 +10,114 @@
|
|
| 10 |
"log_history": [
|
| 11 |
{
|
| 12 |
"epoch": 0.2,
|
| 13 |
-
"grad_norm": 5.
|
| 14 |
"learning_rate": 2.9968542393565676e-06,
|
| 15 |
-
"loss": 1.
|
| 16 |
"step": 5
|
| 17 |
},
|
| 18 |
{
|
| 19 |
"epoch": 0.41,
|
| 20 |
-
"grad_norm":
|
| 21 |
-
"learning_rate": 2.
|
| 22 |
-
"loss": 1.
|
| 23 |
"step": 10
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"epoch": 0.61,
|
| 27 |
-
"grad_norm": 3.
|
| 28 |
-
"learning_rate": 2.
|
| 29 |
-
"loss": 1.
|
| 30 |
"step": 15
|
| 31 |
},
|
| 32 |
{
|
| 33 |
"epoch": 0.82,
|
| 34 |
-
"grad_norm": 3.
|
| 35 |
-
"learning_rate": 2.
|
| 36 |
-
"loss": 1.
|
| 37 |
"step": 20
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"epoch": 1.02,
|
| 41 |
-
"grad_norm": 4.
|
| 42 |
-
"learning_rate": 2.
|
| 43 |
-
"loss": 1.
|
| 44 |
"step": 25
|
| 45 |
},
|
| 46 |
{
|
| 47 |
"epoch": 1.22,
|
| 48 |
-
"grad_norm": 3.
|
| 49 |
-
"learning_rate": 2.
|
| 50 |
-
"loss": 1.
|
| 51 |
"step": 30
|
| 52 |
},
|
| 53 |
{
|
| 54 |
"epoch": 1.43,
|
| 55 |
-
"grad_norm": 4.
|
| 56 |
-
"learning_rate": 2.
|
| 57 |
-
"loss": 1.
|
| 58 |
"step": 35
|
| 59 |
},
|
| 60 |
{
|
| 61 |
"epoch": 1.63,
|
| 62 |
-
"grad_norm": 3.
|
| 63 |
-
"learning_rate": 2.
|
| 64 |
-
"loss": 1.
|
| 65 |
"step": 40
|
| 66 |
},
|
| 67 |
{
|
| 68 |
"epoch": 1.84,
|
| 69 |
-
"grad_norm": 3.
|
| 70 |
-
"learning_rate": 1.
|
| 71 |
-
"loss": 1.
|
| 72 |
"step": 45
|
| 73 |
},
|
| 74 |
{
|
| 75 |
"epoch": 2.04,
|
| 76 |
-
"grad_norm": 3.
|
| 77 |
-
"learning_rate": 1.
|
| 78 |
-
"loss": 1.
|
| 79 |
"step": 50
|
| 80 |
},
|
| 81 |
{
|
| 82 |
"epoch": 2.24,
|
| 83 |
-
"grad_norm": 3.
|
| 84 |
-
"learning_rate": 1.
|
| 85 |
-
"loss": 1.
|
| 86 |
"step": 55
|
| 87 |
},
|
| 88 |
{
|
| 89 |
"epoch": 2.45,
|
| 90 |
-
"grad_norm": 3.
|
| 91 |
-
"learning_rate": 1.
|
| 92 |
-
"loss": 1.
|
| 93 |
"step": 60
|
| 94 |
},
|
| 95 |
{
|
| 96 |
"epoch": 2.65,
|
| 97 |
-
"grad_norm":
|
| 98 |
-
"learning_rate":
|
| 99 |
-
"loss": 1.
|
| 100 |
"step": 65
|
| 101 |
},
|
| 102 |
{
|
| 103 |
"epoch": 2.86,
|
| 104 |
-
"grad_norm": 3.
|
| 105 |
-
"learning_rate": 6.
|
| 106 |
-
"loss": 1.
|
| 107 |
"step": 70
|
| 108 |
},
|
| 109 |
{
|
| 110 |
"epoch": 3.06,
|
| 111 |
-
"grad_norm":
|
| 112 |
-
"learning_rate":
|
| 113 |
-
"loss": 1.
|
| 114 |
"step": 75
|
| 115 |
},
|
| 116 |
{
|
| 117 |
"epoch": 3.27,
|
| 118 |
-
"grad_norm": 3.
|
| 119 |
-
"learning_rate": 3.
|
| 120 |
-
"loss": 1.
|
| 121 |
"step": 80
|
| 122 |
}
|
| 123 |
],
|
|
@@ -126,8 +126,8 @@
|
|
| 126 |
"num_input_tokens_seen": 0,
|
| 127 |
"num_train_epochs": 5,
|
| 128 |
"save_steps": 20,
|
| 129 |
-
"total_flos":
|
| 130 |
-
"train_batch_size":
|
| 131 |
"trial_name": null,
|
| 132 |
"trial_params": null
|
| 133 |
}
|
|
|
|
| 10 |
"log_history": [
|
| 11 |
{
|
| 12 |
"epoch": 0.2,
|
| 13 |
+
"grad_norm": 5.09375,
|
| 14 |
"learning_rate": 2.9968542393565676e-06,
|
| 15 |
+
"loss": 1.9388,
|
| 16 |
"step": 5
|
| 17 |
},
|
| 18 |
{
|
| 19 |
"epoch": 0.41,
|
| 20 |
+
"grad_norm": 4.4375,
|
| 21 |
+
"learning_rate": 2.9616157869703894e-06,
|
| 22 |
+
"loss": 1.8455,
|
| 23 |
"step": 10
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"epoch": 0.61,
|
| 27 |
+
"grad_norm": 3.59375,
|
| 28 |
+
"learning_rate": 2.8881318444640566e-06,
|
| 29 |
+
"loss": 1.7796,
|
| 30 |
"step": 15
|
| 31 |
},
|
| 32 |
{
|
| 33 |
"epoch": 0.82,
|
| 34 |
+
"grad_norm": 3.796875,
|
| 35 |
+
"learning_rate": 2.778325235483954e-06,
|
| 36 |
+
"loss": 1.8091,
|
| 37 |
"step": 20
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"epoch": 1.02,
|
| 41 |
+
"grad_norm": 4.625,
|
| 42 |
+
"learning_rate": 2.6350692237265428e-06,
|
| 43 |
+
"loss": 1.7224,
|
| 44 |
"step": 25
|
| 45 |
},
|
| 46 |
{
|
| 47 |
"epoch": 1.22,
|
| 48 |
+
"grad_norm": 3.953125,
|
| 49 |
+
"learning_rate": 2.4621123294467098e-06,
|
| 50 |
+
"loss": 1.7108,
|
| 51 |
"step": 30
|
| 52 |
},
|
| 53 |
{
|
| 54 |
"epoch": 1.43,
|
| 55 |
+
"grad_norm": 4.125,
|
| 56 |
+
"learning_rate": 2.2639802434931445e-06,
|
| 57 |
+
"loss": 1.7299,
|
| 58 |
"step": 35
|
| 59 |
},
|
| 60 |
{
|
| 61 |
"epoch": 1.63,
|
| 62 |
+
"grad_norm": 3.625,
|
| 63 |
+
"learning_rate": 2.0458574054452316e-06,
|
| 64 |
+
"loss": 1.7111,
|
| 65 |
"step": 40
|
| 66 |
},
|
| 67 |
{
|
| 68 |
"epoch": 1.84,
|
| 69 |
+
"grad_norm": 3.53125,
|
| 70 |
+
"learning_rate": 1.813451344546913e-06,
|
| 71 |
+
"loss": 1.7364,
|
| 72 |
"step": 45
|
| 73 |
},
|
| 74 |
{
|
| 75 |
"epoch": 2.04,
|
| 76 |
+
"grad_norm": 3.1875,
|
| 77 |
+
"learning_rate": 1.5728433331716726e-06,
|
| 78 |
+
"loss": 1.6664,
|
| 79 |
"step": 50
|
| 80 |
},
|
| 81 |
{
|
| 82 |
"epoch": 2.24,
|
| 83 |
+
"grad_norm": 3.125,
|
| 84 |
+
"learning_rate": 1.3303292607070737e-06,
|
| 85 |
+
"loss": 1.6673,
|
| 86 |
"step": 55
|
| 87 |
},
|
| 88 |
{
|
| 89 |
"epoch": 2.45,
|
| 90 |
+
"grad_norm": 3.65625,
|
| 91 |
+
"learning_rate": 1.0922548916454855e-06,
|
| 92 |
+
"loss": 1.6219,
|
| 93 |
"step": 60
|
| 94 |
},
|
| 95 |
{
|
| 96 |
"epoch": 2.65,
|
| 97 |
+
"grad_norm": 2.8125,
|
| 98 |
+
"learning_rate": 8.648498186137653e-07,
|
| 99 |
+
"loss": 1.6648,
|
| 100 |
"step": 65
|
| 101 |
},
|
| 102 |
{
|
| 103 |
"epoch": 2.86,
|
| 104 |
+
"grad_norm": 3.53125,
|
| 105 |
+
"learning_rate": 6.540644552236401e-07,
|
| 106 |
+
"loss": 1.699,
|
| 107 |
"step": 70
|
| 108 |
},
|
| 109 |
{
|
| 110 |
"epoch": 3.06,
|
| 111 |
+
"grad_norm": 4.15625,
|
| 112 |
+
"learning_rate": 4.6541433408284356e-07,
|
| 113 |
+
"loss": 1.6821,
|
| 114 |
"step": 75
|
| 115 |
},
|
| 116 |
{
|
| 117 |
"epoch": 3.27,
|
| 118 |
+
"grad_norm": 3.546875,
|
| 119 |
+
"learning_rate": 3.0383578415591913e-07,
|
| 120 |
+
"loss": 1.6633,
|
| 121 |
"step": 80
|
| 122 |
}
|
| 123 |
],
|
|
|
|
| 126 |
"num_input_tokens_seen": 0,
|
| 127 |
"num_train_epochs": 5,
|
| 128 |
"save_steps": 20,
|
| 129 |
+
"total_flos": 6434740059291648.0,
|
| 130 |
+
"train_batch_size": 4,
|
| 131 |
"trial_name": null,
|
| 132 |
"trial_params": null
|
| 133 |
}
|
checkpoint-80/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23cfefdd62756fac4437632539fdfbb741029e6fb943cafeffe397c21a403a5d
|
| 3 |
size 4960
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23cfefdd62756fac4437632539fdfbb741029e6fb943cafeffe397c21a403a5d
|
| 3 |
size 4960
|