Training in progress, step 2100, checkpoint
Browse files- last-checkpoint/adapter_config.json +5 -5
- last-checkpoint/adapter_model.safetensors +1 -1
- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/rng_state.pth +1 -1
- last-checkpoint/scaler.pt +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +115 -3
- last-checkpoint/training_args.bin +1 -1
last-checkpoint/adapter_config.json
CHANGED
|
@@ -29,13 +29,13 @@
|
|
| 29 |
"rank_pattern": {},
|
| 30 |
"revision": null,
|
| 31 |
"target_modules": [
|
| 32 |
-
"k_proj",
|
| 33 |
-
"down_proj",
|
| 34 |
-
"o_proj",
|
| 35 |
-
"q_proj",
|
| 36 |
"v_proj",
|
|
|
|
| 37 |
"gate_proj",
|
| 38 |
-
"
|
|
|
|
|
|
|
|
|
|
| 39 |
],
|
| 40 |
"target_parameters": null,
|
| 41 |
"task_type": "CAUSAL_LM",
|
|
|
|
| 29 |
"rank_pattern": {},
|
| 30 |
"revision": null,
|
| 31 |
"target_modules": [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
"v_proj",
|
| 33 |
+
"k_proj",
|
| 34 |
"gate_proj",
|
| 35 |
+
"o_proj",
|
| 36 |
+
"up_proj",
|
| 37 |
+
"down_proj",
|
| 38 |
+
"q_proj"
|
| 39 |
],
|
| 40 |
"target_parameters": null,
|
| 41 |
"task_type": "CAUSAL_LM",
|
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 228140600
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c9e46ae2720088669da0e7f9e660e9df21b3f13cd814ef2c054173a76a40c0a8
|
| 3 |
size 228140600
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 117931203
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:76b3121a237388d42068dd86668509dc36abd8695d8ccbfd6fb7b924e1a73d7f
|
| 3 |
size 117931203
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6df16b3659f33d85607b74fb7cdd42ccb03ca1d0dc5313a9352883e092924860
|
| 3 |
size 14645
|
last-checkpoint/scaler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1383
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1ed5fdd6f9fe5f0de5d43635eeeee3253ccf660833d7fe6d9be640b40bec6bbe
|
| 3 |
size 1383
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc835731ce73222513c24c9953cdc95225ff0e18509f3befa431f270d3d03450
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": 750,
|
| 3 |
"best_metric": 0.5089643597602844,
|
| 4 |
"best_model_checkpoint": "./adapter-phase1/checkpoint-750",
|
| 5 |
-
"epoch": 3.
|
| 6 |
"eval_steps": 300,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -2140,6 +2140,118 @@
|
|
| 2140 |
"eval_samples_per_second": 2.036,
|
| 2141 |
"eval_steps_per_second": 0.509,
|
| 2142 |
"step": 2000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2143 |
}
|
| 2144 |
],
|
| 2145 |
"logging_steps": 10,
|
|
@@ -2159,7 +2271,7 @@
|
|
| 2159 |
"attributes": {}
|
| 2160 |
}
|
| 2161 |
},
|
| 2162 |
-
"total_flos": 3.
|
| 2163 |
"train_batch_size": 1,
|
| 2164 |
"trial_name": null,
|
| 2165 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": 750,
|
| 3 |
"best_metric": 0.5089643597602844,
|
| 4 |
"best_model_checkpoint": "./adapter-phase1/checkpoint-750",
|
| 5 |
+
"epoch": 3.36,
|
| 6 |
"eval_steps": 300,
|
| 7 |
+
"global_step": 2100,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 2140 |
"eval_samples_per_second": 2.036,
|
| 2141 |
"eval_steps_per_second": 0.509,
|
| 2142 |
"step": 2000
|
| 2143 |
+
},
|
| 2144 |
+
{
|
| 2145 |
+
"entropy": 0.3617474281229079,
|
| 2146 |
+
"epoch": 3.216,
|
| 2147 |
+
"grad_norm": 0.7705036997795105,
|
| 2148 |
+
"learning_rate": 3.5744e-05,
|
| 2149 |
+
"loss": 0.3175,
|
| 2150 |
+
"mean_token_accuracy": 0.9062783475965261,
|
| 2151 |
+
"num_tokens": 20779.0,
|
| 2152 |
+
"step": 2010
|
| 2153 |
+
},
|
| 2154 |
+
{
|
| 2155 |
+
"entropy": 0.3887558562681079,
|
| 2156 |
+
"epoch": 3.232,
|
| 2157 |
+
"grad_norm": 0.9926668405532837,
|
| 2158 |
+
"learning_rate": 3.5424e-05,
|
| 2159 |
+
"loss": 0.3243,
|
| 2160 |
+
"mean_token_accuracy": 0.9048940639942884,
|
| 2161 |
+
"num_tokens": 37039.0,
|
| 2162 |
+
"step": 2020
|
| 2163 |
+
},
|
| 2164 |
+
{
|
| 2165 |
+
"entropy": 0.36308987056836484,
|
| 2166 |
+
"epoch": 3.248,
|
| 2167 |
+
"grad_norm": 0.5336251258850098,
|
| 2168 |
+
"learning_rate": 3.5104e-05,
|
| 2169 |
+
"loss": 0.3286,
|
| 2170 |
+
"mean_token_accuracy": 0.9028704173862934,
|
| 2171 |
+
"num_tokens": 66230.0,
|
| 2172 |
+
"step": 2030
|
| 2173 |
+
},
|
| 2174 |
+
{
|
| 2175 |
+
"entropy": 0.3100855226628482,
|
| 2176 |
+
"epoch": 3.2640000000000002,
|
| 2177 |
+
"grad_norm": 0.6235008239746094,
|
| 2178 |
+
"learning_rate": 3.4784e-05,
|
| 2179 |
+
"loss": 0.3026,
|
| 2180 |
+
"mean_token_accuracy": 0.9074051853269338,
|
| 2181 |
+
"num_tokens": 98315.0,
|
| 2182 |
+
"step": 2040
|
| 2183 |
+
},
|
| 2184 |
+
{
|
| 2185 |
+
"entropy": 0.33463340234011413,
|
| 2186 |
+
"epoch": 3.2800000000000002,
|
| 2187 |
+
"grad_norm": 0.6380220651626587,
|
| 2188 |
+
"learning_rate": 3.4464e-05,
|
| 2189 |
+
"loss": 0.3058,
|
| 2190 |
+
"mean_token_accuracy": 0.9115277793258428,
|
| 2191 |
+
"num_tokens": 123538.0,
|
| 2192 |
+
"step": 2050
|
| 2193 |
+
},
|
| 2194 |
+
{
|
| 2195 |
+
"entropy": 0.3619419479742646,
|
| 2196 |
+
"epoch": 3.296,
|
| 2197 |
+
"grad_norm": 0.7604582905769348,
|
| 2198 |
+
"learning_rate": 3.4144000000000004e-05,
|
| 2199 |
+
"loss": 0.3112,
|
| 2200 |
+
"mean_token_accuracy": 0.9084025923162699,
|
| 2201 |
+
"num_tokens": 143855.0,
|
| 2202 |
+
"step": 2060
|
| 2203 |
+
},
|
| 2204 |
+
{
|
| 2205 |
+
"entropy": 0.3980453579686582,
|
| 2206 |
+
"epoch": 3.312,
|
| 2207 |
+
"grad_norm": 0.8576037883758545,
|
| 2208 |
+
"learning_rate": 3.3824e-05,
|
| 2209 |
+
"loss": 0.3267,
|
| 2210 |
+
"mean_token_accuracy": 0.9037791218608617,
|
| 2211 |
+
"num_tokens": 159314.0,
|
| 2212 |
+
"step": 2070
|
| 2213 |
+
},
|
| 2214 |
+
{
|
| 2215 |
+
"entropy": 0.35077386572957037,
|
| 2216 |
+
"epoch": 3.328,
|
| 2217 |
+
"grad_norm": 0.5504621863365173,
|
| 2218 |
+
"learning_rate": 3.3504e-05,
|
| 2219 |
+
"loss": 0.3004,
|
| 2220 |
+
"mean_token_accuracy": 0.9084354028105736,
|
| 2221 |
+
"num_tokens": 187464.0,
|
| 2222 |
+
"step": 2080
|
| 2223 |
+
},
|
| 2224 |
+
{
|
| 2225 |
+
"entropy": 0.28209723997861147,
|
| 2226 |
+
"epoch": 3.344,
|
| 2227 |
+
"grad_norm": 0.8361979126930237,
|
| 2228 |
+
"learning_rate": 3.3184000000000006e-05,
|
| 2229 |
+
"loss": 0.2903,
|
| 2230 |
+
"mean_token_accuracy": 0.9112230580300092,
|
| 2231 |
+
"num_tokens": 219657.0,
|
| 2232 |
+
"step": 2090
|
| 2233 |
+
},
|
| 2234 |
+
{
|
| 2235 |
+
"entropy": 0.3153431011363864,
|
| 2236 |
+
"epoch": 3.36,
|
| 2237 |
+
"grad_norm": 0.6275749802589417,
|
| 2238 |
+
"learning_rate": 3.2864e-05,
|
| 2239 |
+
"loss": 0.2894,
|
| 2240 |
+
"mean_token_accuracy": 0.9114996068179607,
|
| 2241 |
+
"num_tokens": 245396.0,
|
| 2242 |
+
"step": 2100
|
| 2243 |
+
},
|
| 2244 |
+
{
|
| 2245 |
+
"epoch": 3.36,
|
| 2246 |
+
"eval_accuracy": 0.026501569905019107,
|
| 2247 |
+
"eval_entropy": 0.4113759865760803,
|
| 2248 |
+
"eval_loss": 0.541074275970459,
|
| 2249 |
+
"eval_mean_token_accuracy": 0.8583663606643677,
|
| 2250 |
+
"eval_num_tokens": 245396.0,
|
| 2251 |
+
"eval_runtime": 869.6626,
|
| 2252 |
+
"eval_samples_per_second": 2.3,
|
| 2253 |
+
"eval_steps_per_second": 0.575,
|
| 2254 |
+
"step": 2100
|
| 2255 |
}
|
| 2256 |
],
|
| 2257 |
"logging_steps": 10,
|
|
|
|
| 2271 |
"attributes": {}
|
| 2272 |
}
|
| 2273 |
},
|
| 2274 |
+
"total_flos": 3.6234506980141056e+17,
|
| 2275 |
"train_batch_size": 1,
|
| 2276 |
"trial_name": null,
|
| 2277 |
"trial_params": null
|
last-checkpoint/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 6353
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dc0c9c43aae96575e8afc416e967ac5674d13cc1a38c487b69cd4534aafef005
|
| 3 |
size 6353
|