Training in progress, step 3125, checkpoint
Browse files
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 228140600
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:31e65c9ff039c74d59b4607524385f75a8ae083b148b3a163cece010a9774af0
|
| 3 |
size 228140600
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 117931203
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b96216027c02e20a6ee8541060ecd0085b74fd0ea5669cf82258347c31d3baf
|
| 3 |
size 117931203
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e788bee1c067926ef11645e418ec428402ec185fb9258e04df56296e42d2286b
|
| 3 |
size 14645
|
last-checkpoint/scaler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1383
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e230928162c4463d462e64ab14b3906988dfebe47926d517a84f2e81ec7582c
|
| 3 |
size 1383
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2b0095603c7ffc8d3152c5de9d397fd1beca2e9651bdba9b9da9fbad8a37e19c
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": 750,
|
| 3 |
"best_metric": 0.5089643597602844,
|
| 4 |
"best_model_checkpoint": "./adapter-phase1/checkpoint-750",
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 300,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -3188,6 +3188,126 @@
|
|
| 3188 |
"eval_samples_per_second": 2.07,
|
| 3189 |
"eval_steps_per_second": 0.518,
|
| 3190 |
"step": 3000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3191 |
}
|
| 3192 |
],
|
| 3193 |
"logging_steps": 10,
|
|
@@ -3202,12 +3322,12 @@
|
|
| 3202 |
"should_evaluate": false,
|
| 3203 |
"should_log": false,
|
| 3204 |
"should_save": true,
|
| 3205 |
-
"should_training_stop":
|
| 3206 |
},
|
| 3207 |
"attributes": {}
|
| 3208 |
}
|
| 3209 |
},
|
| 3210 |
-
"total_flos": 5.
|
| 3211 |
"train_batch_size": 1,
|
| 3212 |
"trial_name": null,
|
| 3213 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": 750,
|
| 3 |
"best_metric": 0.5089643597602844,
|
| 4 |
"best_model_checkpoint": "./adapter-phase1/checkpoint-750",
|
| 5 |
+
"epoch": 5.0,
|
| 6 |
"eval_steps": 300,
|
| 7 |
+
"global_step": 3125,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 3188 |
"eval_samples_per_second": 2.07,
|
| 3189 |
"eval_steps_per_second": 0.518,
|
| 3190 |
"step": 3000
|
| 3191 |
+
},
|
| 3192 |
+
{
|
| 3193 |
+
"entropy": 0.19047842593863606,
|
| 3194 |
+
"epoch": 4.816,
|
| 3195 |
+
"grad_norm": 0.8224709033966064,
|
| 3196 |
+
"learning_rate": 3.8080000000000006e-06,
|
| 3197 |
+
"loss": 0.1691,
|
| 3198 |
+
"mean_token_accuracy": 0.9483149264007806,
|
| 3199 |
+
"num_tokens": 785457.0,
|
| 3200 |
+
"step": 3010
|
| 3201 |
+
},
|
| 3202 |
+
{
|
| 3203 |
+
"entropy": 0.1947814745362848,
|
| 3204 |
+
"epoch": 4.832,
|
| 3205 |
+
"grad_norm": 0.8581233024597168,
|
| 3206 |
+
"learning_rate": 3.4880000000000003e-06,
|
| 3207 |
+
"loss": 0.1535,
|
| 3208 |
+
"mean_token_accuracy": 0.9543764512985945,
|
| 3209 |
+
"num_tokens": 814006.0,
|
| 3210 |
+
"step": 3020
|
| 3211 |
+
},
|
| 3212 |
+
{
|
| 3213 |
+
"entropy": 0.20228669252246617,
|
| 3214 |
+
"epoch": 4.848,
|
| 3215 |
+
"grad_norm": 0.7815537452697754,
|
| 3216 |
+
"learning_rate": 3.168e-06,
|
| 3217 |
+
"loss": 0.1539,
|
| 3218 |
+
"mean_token_accuracy": 0.9561178237199783,
|
| 3219 |
+
"num_tokens": 836843.0,
|
| 3220 |
+
"step": 3030
|
| 3221 |
+
},
|
| 3222 |
+
{
|
| 3223 |
+
"entropy": 0.2111768877133727,
|
| 3224 |
+
"epoch": 4.864,
|
| 3225 |
+
"grad_norm": 2.0849273204803467,
|
| 3226 |
+
"learning_rate": 2.848e-06,
|
| 3227 |
+
"loss": 0.1553,
|
| 3228 |
+
"mean_token_accuracy": 0.9579557087272406,
|
| 3229 |
+
"num_tokens": 855036.0,
|
| 3230 |
+
"step": 3040
|
| 3231 |
+
},
|
| 3232 |
+
{
|
| 3233 |
+
"entropy": 0.2543737689033151,
|
| 3234 |
+
"epoch": 4.88,
|
| 3235 |
+
"grad_norm": 0.9005395770072937,
|
| 3236 |
+
"learning_rate": 2.528e-06,
|
| 3237 |
+
"loss": 0.18,
|
| 3238 |
+
"mean_token_accuracy": 0.951928498968482,
|
| 3239 |
+
"num_tokens": 867473.0,
|
| 3240 |
+
"step": 3050
|
| 3241 |
+
},
|
| 3242 |
+
{
|
| 3243 |
+
"entropy": 0.19695296385325492,
|
| 3244 |
+
"epoch": 4.896,
|
| 3245 |
+
"grad_norm": 0.8913720846176147,
|
| 3246 |
+
"learning_rate": 2.208e-06,
|
| 3247 |
+
"loss": 0.1731,
|
| 3248 |
+
"mean_token_accuracy": 0.9454629000276327,
|
| 3249 |
+
"num_tokens": 905517.0,
|
| 3250 |
+
"step": 3060
|
| 3251 |
+
},
|
| 3252 |
+
{
|
| 3253 |
+
"entropy": 0.2020930268801749,
|
| 3254 |
+
"epoch": 4.912,
|
| 3255 |
+
"grad_norm": 1.0501484870910645,
|
| 3256 |
+
"learning_rate": 1.8880000000000002e-06,
|
| 3257 |
+
"loss": 0.1583,
|
| 3258 |
+
"mean_token_accuracy": 0.954399960488081,
|
| 3259 |
+
"num_tokens": 933251.0,
|
| 3260 |
+
"step": 3070
|
| 3261 |
+
},
|
| 3262 |
+
{
|
| 3263 |
+
"entropy": 0.20252155787311493,
|
| 3264 |
+
"epoch": 4.928,
|
| 3265 |
+
"grad_norm": 1.03731369972229,
|
| 3266 |
+
"learning_rate": 1.568e-06,
|
| 3267 |
+
"loss": 0.1531,
|
| 3268 |
+
"mean_token_accuracy": 0.9579384963959455,
|
| 3269 |
+
"num_tokens": 956069.0,
|
| 3270 |
+
"step": 3080
|
| 3271 |
+
},
|
| 3272 |
+
{
|
| 3273 |
+
"entropy": 0.2126692888326943,
|
| 3274 |
+
"epoch": 4.944,
|
| 3275 |
+
"grad_norm": 1.107572317123413,
|
| 3276 |
+
"learning_rate": 1.248e-06,
|
| 3277 |
+
"loss": 0.1568,
|
| 3278 |
+
"mean_token_accuracy": 0.9569063678383827,
|
| 3279 |
+
"num_tokens": 974517.0,
|
| 3280 |
+
"step": 3090
|
| 3281 |
+
},
|
| 3282 |
+
{
|
| 3283 |
+
"entropy": 0.24990466320887209,
|
| 3284 |
+
"epoch": 4.96,
|
| 3285 |
+
"grad_norm": 1.2767953872680664,
|
| 3286 |
+
"learning_rate": 9.28e-07,
|
| 3287 |
+
"loss": 0.1851,
|
| 3288 |
+
"mean_token_accuracy": 0.9518057998269797,
|
| 3289 |
+
"num_tokens": 987191.0,
|
| 3290 |
+
"step": 3100
|
| 3291 |
+
},
|
| 3292 |
+
{
|
| 3293 |
+
"entropy": 0.19635155922733247,
|
| 3294 |
+
"epoch": 4.976,
|
| 3295 |
+
"grad_norm": 0.838716447353363,
|
| 3296 |
+
"learning_rate": 6.08e-07,
|
| 3297 |
+
"loss": 0.1689,
|
| 3298 |
+
"mean_token_accuracy": 0.9492763552814723,
|
| 3299 |
+
"num_tokens": 1021442.0,
|
| 3300 |
+
"step": 3110
|
| 3301 |
+
},
|
| 3302 |
+
{
|
| 3303 |
+
"entropy": 0.21572725460864603,
|
| 3304 |
+
"epoch": 4.992,
|
| 3305 |
+
"grad_norm": 0.9043759107589722,
|
| 3306 |
+
"learning_rate": 2.8800000000000004e-07,
|
| 3307 |
+
"loss": 0.161,
|
| 3308 |
+
"mean_token_accuracy": 0.9549260966479778,
|
| 3309 |
+
"num_tokens": 1041350.0,
|
| 3310 |
+
"step": 3120
|
| 3311 |
}
|
| 3312 |
],
|
| 3313 |
"logging_steps": 10,
|
|
|
|
| 3322 |
"should_evaluate": false,
|
| 3323 |
"should_log": false,
|
| 3324 |
"should_save": true,
|
| 3325 |
+
"should_training_stop": true
|
| 3326 |
},
|
| 3327 |
"attributes": {}
|
| 3328 |
}
|
| 3329 |
},
|
| 3330 |
+
"total_flos": 5.37035906398464e+17,
|
| 3331 |
"train_batch_size": 1,
|
| 3332 |
"trial_name": null,
|
| 3333 |
"trial_params": null
|