Training in progress, step 19000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 517931840
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:50692c69fe3ea90614dc625956890e6dd059a4900ffb733cb441c9d9b0be1ed6
|
| 3 |
size 517931840
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1035661434
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f34ad85e7a64410399bc0984c1c1c25765a6659574c5d382b0c132a27be2f0f8
|
| 3 |
size 1035661434
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09184de0af072dcf6f15e331e61deb81a6900d407b5c7ebcb519d56082f36e97
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:02c3d80aaacee80212417a329afbc88c74b35bad8004900a2301b44b629b4ab7
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -3212,11 +3212,189 @@
|
|
| 3212 |
"eval_steps_per_second": 19.114,
|
| 3213 |
"num_input_tokens_seen": 18874368000,
|
| 3214 |
"step": 18000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3215 |
}
|
| 3216 |
],
|
| 3217 |
"logging_steps": 50,
|
| 3218 |
"max_steps": 200000,
|
| 3219 |
-
"num_input_tokens_seen":
|
| 3220 |
"num_train_epochs": 5,
|
| 3221 |
"save_steps": 1000,
|
| 3222 |
"stateful_callbacks": {
|
|
@@ -3231,7 +3409,7 @@
|
|
| 3231 |
"attributes": {}
|
| 3232 |
}
|
| 3233 |
},
|
| 3234 |
-
"total_flos": 1.
|
| 3235 |
"train_batch_size": 64,
|
| 3236 |
"trial_name": null,
|
| 3237 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.4173548216151357,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 19000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 3212 |
"eval_steps_per_second": 19.114,
|
| 3213 |
"num_input_tokens_seen": 18874368000,
|
| 3214 |
"step": 18000
|
| 3215 |
+
},
|
| 3216 |
+
{
|
| 3217 |
+
"epoch": 0.3964870805343789,
|
| 3218 |
+
"grad_norm": 0.1353403478860855,
|
| 3219 |
+
"learning_rate": 0.001,
|
| 3220 |
+
"loss": 2.724,
|
| 3221 |
+
"num_input_tokens_seen": 18926796800,
|
| 3222 |
+
"step": 18050
|
| 3223 |
+
},
|
| 3224 |
+
{
|
| 3225 |
+
"epoch": 0.397585382696524,
|
| 3226 |
+
"grad_norm": 0.15004459023475647,
|
| 3227 |
+
"learning_rate": 0.001,
|
| 3228 |
+
"loss": 2.717,
|
| 3229 |
+
"num_input_tokens_seen": 18979225600,
|
| 3230 |
+
"step": 18100
|
| 3231 |
+
},
|
| 3232 |
+
{
|
| 3233 |
+
"epoch": 0.3986836848586691,
|
| 3234 |
+
"grad_norm": 0.1293007880449295,
|
| 3235 |
+
"learning_rate": 0.001,
|
| 3236 |
+
"loss": 2.7187,
|
| 3237 |
+
"num_input_tokens_seen": 19031654400,
|
| 3238 |
+
"step": 18150
|
| 3239 |
+
},
|
| 3240 |
+
{
|
| 3241 |
+
"epoch": 0.3997819870208142,
|
| 3242 |
+
"grad_norm": 0.16373878717422485,
|
| 3243 |
+
"learning_rate": 0.001,
|
| 3244 |
+
"loss": 2.7217,
|
| 3245 |
+
"num_input_tokens_seen": 19084083200,
|
| 3246 |
+
"step": 18200
|
| 3247 |
+
},
|
| 3248 |
+
{
|
| 3249 |
+
"epoch": 0.4008802891829593,
|
| 3250 |
+
"grad_norm": 0.1529611349105835,
|
| 3251 |
+
"learning_rate": 0.001,
|
| 3252 |
+
"loss": 2.722,
|
| 3253 |
+
"num_input_tokens_seen": 19136512000,
|
| 3254 |
+
"step": 18250
|
| 3255 |
+
},
|
| 3256 |
+
{
|
| 3257 |
+
"epoch": 0.4019785913451044,
|
| 3258 |
+
"grad_norm": 0.14109951257705688,
|
| 3259 |
+
"learning_rate": 0.001,
|
| 3260 |
+
"loss": 2.7232,
|
| 3261 |
+
"num_input_tokens_seen": 19188940800,
|
| 3262 |
+
"step": 18300
|
| 3263 |
+
},
|
| 3264 |
+
{
|
| 3265 |
+
"epoch": 0.40307689350724946,
|
| 3266 |
+
"grad_norm": 0.13841493427753448,
|
| 3267 |
+
"learning_rate": 0.001,
|
| 3268 |
+
"loss": 2.7195,
|
| 3269 |
+
"num_input_tokens_seen": 19241369600,
|
| 3270 |
+
"step": 18350
|
| 3271 |
+
},
|
| 3272 |
+
{
|
| 3273 |
+
"epoch": 0.4041751956693946,
|
| 3274 |
+
"grad_norm": 0.13508476316928864,
|
| 3275 |
+
"learning_rate": 0.001,
|
| 3276 |
+
"loss": 2.7166,
|
| 3277 |
+
"num_input_tokens_seen": 19293798400,
|
| 3278 |
+
"step": 18400
|
| 3279 |
+
},
|
| 3280 |
+
{
|
| 3281 |
+
"epoch": 0.40527349783153965,
|
| 3282 |
+
"grad_norm": 0.1372646540403366,
|
| 3283 |
+
"learning_rate": 0.001,
|
| 3284 |
+
"loss": 2.7212,
|
| 3285 |
+
"num_input_tokens_seen": 19346227200,
|
| 3286 |
+
"step": 18450
|
| 3287 |
+
},
|
| 3288 |
+
{
|
| 3289 |
+
"epoch": 0.4063717999936848,
|
| 3290 |
+
"grad_norm": 0.1485033482313156,
|
| 3291 |
+
"learning_rate": 0.001,
|
| 3292 |
+
"loss": 2.7186,
|
| 3293 |
+
"num_input_tokens_seen": 19398656000,
|
| 3294 |
+
"step": 18500
|
| 3295 |
+
},
|
| 3296 |
+
{
|
| 3297 |
+
"epoch": 0.4063717999936848,
|
| 3298 |
+
"eval_loss": 2.622330904006958,
|
| 3299 |
+
"eval_runtime": 66.3601,
|
| 3300 |
+
"eval_samples_per_second": 75.346,
|
| 3301 |
+
"eval_steps_per_second": 18.837,
|
| 3302 |
+
"num_input_tokens_seen": 19398656000,
|
| 3303 |
+
"step": 18500
|
| 3304 |
+
},
|
| 3305 |
+
{
|
| 3306 |
+
"epoch": 0.40747010215582985,
|
| 3307 |
+
"grad_norm": 0.1484711617231369,
|
| 3308 |
+
"learning_rate": 0.001,
|
| 3309 |
+
"loss": 2.7235,
|
| 3310 |
+
"num_input_tokens_seen": 19451084800,
|
| 3311 |
+
"step": 18550
|
| 3312 |
+
},
|
| 3313 |
+
{
|
| 3314 |
+
"epoch": 0.408568404317975,
|
| 3315 |
+
"grad_norm": 0.141770601272583,
|
| 3316 |
+
"learning_rate": 0.001,
|
| 3317 |
+
"loss": 2.7225,
|
| 3318 |
+
"num_input_tokens_seen": 19503513600,
|
| 3319 |
+
"step": 18600
|
| 3320 |
+
},
|
| 3321 |
+
{
|
| 3322 |
+
"epoch": 0.40966670648012005,
|
| 3323 |
+
"grad_norm": 0.1213323250412941,
|
| 3324 |
+
"learning_rate": 0.001,
|
| 3325 |
+
"loss": 2.7212,
|
| 3326 |
+
"num_input_tokens_seen": 19555942400,
|
| 3327 |
+
"step": 18650
|
| 3328 |
+
},
|
| 3329 |
+
{
|
| 3330 |
+
"epoch": 0.4107650086422651,
|
| 3331 |
+
"grad_norm": 0.14149373769760132,
|
| 3332 |
+
"learning_rate": 0.001,
|
| 3333 |
+
"loss": 2.7181,
|
| 3334 |
+
"num_input_tokens_seen": 19608371200,
|
| 3335 |
+
"step": 18700
|
| 3336 |
+
},
|
| 3337 |
+
{
|
| 3338 |
+
"epoch": 0.41186331080441024,
|
| 3339 |
+
"grad_norm": 0.13964049518108368,
|
| 3340 |
+
"learning_rate": 0.001,
|
| 3341 |
+
"loss": 2.7147,
|
| 3342 |
+
"num_input_tokens_seen": 19660800000,
|
| 3343 |
+
"step": 18750
|
| 3344 |
+
},
|
| 3345 |
+
{
|
| 3346 |
+
"epoch": 0.4129616129665553,
|
| 3347 |
+
"grad_norm": 0.1384592205286026,
|
| 3348 |
+
"learning_rate": 0.001,
|
| 3349 |
+
"loss": 2.7141,
|
| 3350 |
+
"num_input_tokens_seen": 19713228800,
|
| 3351 |
+
"step": 18800
|
| 3352 |
+
},
|
| 3353 |
+
{
|
| 3354 |
+
"epoch": 0.41405991512870044,
|
| 3355 |
+
"grad_norm": 0.15027381479740143,
|
| 3356 |
+
"learning_rate": 0.001,
|
| 3357 |
+
"loss": 2.7185,
|
| 3358 |
+
"num_input_tokens_seen": 19765657600,
|
| 3359 |
+
"step": 18850
|
| 3360 |
+
},
|
| 3361 |
+
{
|
| 3362 |
+
"epoch": 0.4151582172908455,
|
| 3363 |
+
"grad_norm": 0.15221597254276276,
|
| 3364 |
+
"learning_rate": 0.001,
|
| 3365 |
+
"loss": 2.7206,
|
| 3366 |
+
"num_input_tokens_seen": 19818086400,
|
| 3367 |
+
"step": 18900
|
| 3368 |
+
},
|
| 3369 |
+
{
|
| 3370 |
+
"epoch": 0.4162565194529906,
|
| 3371 |
+
"grad_norm": 0.1272735893726349,
|
| 3372 |
+
"learning_rate": 0.001,
|
| 3373 |
+
"loss": 2.7183,
|
| 3374 |
+
"num_input_tokens_seen": 19870515200,
|
| 3375 |
+
"step": 18950
|
| 3376 |
+
},
|
| 3377 |
+
{
|
| 3378 |
+
"epoch": 0.4173548216151357,
|
| 3379 |
+
"grad_norm": 0.1258268654346466,
|
| 3380 |
+
"learning_rate": 0.001,
|
| 3381 |
+
"loss": 2.7117,
|
| 3382 |
+
"num_input_tokens_seen": 19922944000,
|
| 3383 |
+
"step": 19000
|
| 3384 |
+
},
|
| 3385 |
+
{
|
| 3386 |
+
"epoch": 0.4173548216151357,
|
| 3387 |
+
"eval_loss": 2.619187116622925,
|
| 3388 |
+
"eval_runtime": 65.7537,
|
| 3389 |
+
"eval_samples_per_second": 76.041,
|
| 3390 |
+
"eval_steps_per_second": 19.01,
|
| 3391 |
+
"num_input_tokens_seen": 19922944000,
|
| 3392 |
+
"step": 19000
|
| 3393 |
}
|
| 3394 |
],
|
| 3395 |
"logging_steps": 50,
|
| 3396 |
"max_steps": 200000,
|
| 3397 |
+
"num_input_tokens_seen": 19922944000,
|
| 3398 |
"num_train_epochs": 5,
|
| 3399 |
"save_steps": 1000,
|
| 3400 |
"stateful_callbacks": {
|
|
|
|
| 3409 |
"attributes": {}
|
| 3410 |
}
|
| 3411 |
},
|
| 3412 |
+
"total_flos": 1.1346262603333632e+19,
|
| 3413 |
"train_batch_size": 64,
|
| 3414 |
"trial_name": null,
|
| 3415 |
"trial_params": null
|