Training in progress, step 20000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 517931840
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a0634cd3b48faa896331e649d644ee85a0e0af72246ab7393a66a3c2518bb02e
|
| 3 |
size 517931840
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1035661434
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:611dbdaa20f4f869458e449fe2e70d417e2df56bd8ff59602f5187369567bda1
|
| 3 |
size 1035661434
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fd42aefaf8cffc05ebd908742fc863dc5486d9c9296568766959af6a5b7610ad
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6662ae68d38995d5846f13e724946a2acb1395046b7d08977dde3dab733945c0
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -3390,11 +3390,189 @@
|
|
| 3390 |
"eval_steps_per_second": 19.01,
|
| 3391 |
"num_input_tokens_seen": 19922944000,
|
| 3392 |
"step": 19000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3393 |
}
|
| 3394 |
],
|
| 3395 |
"logging_steps": 50,
|
| 3396 |
"max_steps": 200000,
|
| 3397 |
-
"num_input_tokens_seen":
|
| 3398 |
"num_train_epochs": 5,
|
| 3399 |
"save_steps": 1000,
|
| 3400 |
"stateful_callbacks": {
|
|
@@ -3409,7 +3587,7 @@
|
|
| 3409 |
"attributes": {}
|
| 3410 |
}
|
| 3411 |
},
|
| 3412 |
-
"total_flos": 1.
|
| 3413 |
"train_batch_size": 64,
|
| 3414 |
"trial_name": null,
|
| 3415 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.43932086485803756,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 20000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 3390 |
"eval_steps_per_second": 19.01,
|
| 3391 |
"num_input_tokens_seen": 19922944000,
|
| 3392 |
"step": 19000
|
| 3393 |
+
},
|
| 3394 |
+
{
|
| 3395 |
+
"epoch": 0.4184531237772808,
|
| 3396 |
+
"grad_norm": 0.12389284372329712,
|
| 3397 |
+
"learning_rate": 0.001,
|
| 3398 |
+
"loss": 2.7222,
|
| 3399 |
+
"num_input_tokens_seen": 19975372800,
|
| 3400 |
+
"step": 19050
|
| 3401 |
+
},
|
| 3402 |
+
{
|
| 3403 |
+
"epoch": 0.4195514259394259,
|
| 3404 |
+
"grad_norm": 0.14157339930534363,
|
| 3405 |
+
"learning_rate": 0.001,
|
| 3406 |
+
"loss": 2.7178,
|
| 3407 |
+
"num_input_tokens_seen": 20027801600,
|
| 3408 |
+
"step": 19100
|
| 3409 |
+
},
|
| 3410 |
+
{
|
| 3411 |
+
"epoch": 0.420649728101571,
|
| 3412 |
+
"grad_norm": 0.1490466445684433,
|
| 3413 |
+
"learning_rate": 0.001,
|
| 3414 |
+
"loss": 2.7185,
|
| 3415 |
+
"num_input_tokens_seen": 20080230400,
|
| 3416 |
+
"step": 19150
|
| 3417 |
+
},
|
| 3418 |
+
{
|
| 3419 |
+
"epoch": 0.4217480302637161,
|
| 3420 |
+
"grad_norm": 0.14112494885921478,
|
| 3421 |
+
"learning_rate": 0.001,
|
| 3422 |
+
"loss": 2.7166,
|
| 3423 |
+
"num_input_tokens_seen": 20132659200,
|
| 3424 |
+
"step": 19200
|
| 3425 |
+
},
|
| 3426 |
+
{
|
| 3427 |
+
"epoch": 0.42284633242586117,
|
| 3428 |
+
"grad_norm": 0.13986504077911377,
|
| 3429 |
+
"learning_rate": 0.001,
|
| 3430 |
+
"loss": 2.7201,
|
| 3431 |
+
"num_input_tokens_seen": 20185088000,
|
| 3432 |
+
"step": 19250
|
| 3433 |
+
},
|
| 3434 |
+
{
|
| 3435 |
+
"epoch": 0.42394463458800624,
|
| 3436 |
+
"grad_norm": 0.14087803661823273,
|
| 3437 |
+
"learning_rate": 0.001,
|
| 3438 |
+
"loss": 2.7175,
|
| 3439 |
+
"num_input_tokens_seen": 20237516800,
|
| 3440 |
+
"step": 19300
|
| 3441 |
+
},
|
| 3442 |
+
{
|
| 3443 |
+
"epoch": 0.42504293675015137,
|
| 3444 |
+
"grad_norm": 0.165438711643219,
|
| 3445 |
+
"learning_rate": 0.001,
|
| 3446 |
+
"loss": 2.7155,
|
| 3447 |
+
"num_input_tokens_seen": 20289945600,
|
| 3448 |
+
"step": 19350
|
| 3449 |
+
},
|
| 3450 |
+
{
|
| 3451 |
+
"epoch": 0.42614123891229644,
|
| 3452 |
+
"grad_norm": 0.132109135389328,
|
| 3453 |
+
"learning_rate": 0.001,
|
| 3454 |
+
"loss": 2.7116,
|
| 3455 |
+
"num_input_tokens_seen": 20342374400,
|
| 3456 |
+
"step": 19400
|
| 3457 |
+
},
|
| 3458 |
+
{
|
| 3459 |
+
"epoch": 0.42723954107444156,
|
| 3460 |
+
"grad_norm": 0.1372772753238678,
|
| 3461 |
+
"learning_rate": 0.001,
|
| 3462 |
+
"loss": 2.7137,
|
| 3463 |
+
"num_input_tokens_seen": 20394803200,
|
| 3464 |
+
"step": 19450
|
| 3465 |
+
},
|
| 3466 |
+
{
|
| 3467 |
+
"epoch": 0.42833784323658664,
|
| 3468 |
+
"grad_norm": 0.1470147669315338,
|
| 3469 |
+
"learning_rate": 0.001,
|
| 3470 |
+
"loss": 2.7081,
|
| 3471 |
+
"num_input_tokens_seen": 20447232000,
|
| 3472 |
+
"step": 19500
|
| 3473 |
+
},
|
| 3474 |
+
{
|
| 3475 |
+
"epoch": 0.42833784323658664,
|
| 3476 |
+
"eval_loss": 2.615947961807251,
|
| 3477 |
+
"eval_runtime": 65.588,
|
| 3478 |
+
"eval_samples_per_second": 76.233,
|
| 3479 |
+
"eval_steps_per_second": 19.058,
|
| 3480 |
+
"num_input_tokens_seen": 20447232000,
|
| 3481 |
+
"step": 19500
|
| 3482 |
+
},
|
| 3483 |
+
{
|
| 3484 |
+
"epoch": 0.42943614539873176,
|
| 3485 |
+
"grad_norm": 0.15671676397323608,
|
| 3486 |
+
"learning_rate": 0.001,
|
| 3487 |
+
"loss": 2.7176,
|
| 3488 |
+
"num_input_tokens_seen": 20499660800,
|
| 3489 |
+
"step": 19550
|
| 3490 |
+
},
|
| 3491 |
+
{
|
| 3492 |
+
"epoch": 0.43053444756087683,
|
| 3493 |
+
"grad_norm": 0.13104794919490814,
|
| 3494 |
+
"learning_rate": 0.001,
|
| 3495 |
+
"loss": 2.7108,
|
| 3496 |
+
"num_input_tokens_seen": 20552089600,
|
| 3497 |
+
"step": 19600
|
| 3498 |
+
},
|
| 3499 |
+
{
|
| 3500 |
+
"epoch": 0.4316327497230219,
|
| 3501 |
+
"grad_norm": 0.14532406628131866,
|
| 3502 |
+
"learning_rate": 0.001,
|
| 3503 |
+
"loss": 2.7087,
|
| 3504 |
+
"num_input_tokens_seen": 20604518400,
|
| 3505 |
+
"step": 19650
|
| 3506 |
+
},
|
| 3507 |
+
{
|
| 3508 |
+
"epoch": 0.43273105188516703,
|
| 3509 |
+
"grad_norm": 0.16199354827404022,
|
| 3510 |
+
"learning_rate": 0.001,
|
| 3511 |
+
"loss": 2.7178,
|
| 3512 |
+
"num_input_tokens_seen": 20656947200,
|
| 3513 |
+
"step": 19700
|
| 3514 |
+
},
|
| 3515 |
+
{
|
| 3516 |
+
"epoch": 0.4338293540473121,
|
| 3517 |
+
"grad_norm": 0.13537316024303436,
|
| 3518 |
+
"learning_rate": 0.001,
|
| 3519 |
+
"loss": 2.7124,
|
| 3520 |
+
"num_input_tokens_seen": 20709376000,
|
| 3521 |
+
"step": 19750
|
| 3522 |
+
},
|
| 3523 |
+
{
|
| 3524 |
+
"epoch": 0.4349276562094572,
|
| 3525 |
+
"grad_norm": 0.15098537504673004,
|
| 3526 |
+
"learning_rate": 0.001,
|
| 3527 |
+
"loss": 2.7119,
|
| 3528 |
+
"num_input_tokens_seen": 20761804800,
|
| 3529 |
+
"step": 19800
|
| 3530 |
+
},
|
| 3531 |
+
{
|
| 3532 |
+
"epoch": 0.4360259583716023,
|
| 3533 |
+
"grad_norm": 0.21563659608364105,
|
| 3534 |
+
"learning_rate": 0.001,
|
| 3535 |
+
"loss": 2.7118,
|
| 3536 |
+
"num_input_tokens_seen": 20814233600,
|
| 3537 |
+
"step": 19850
|
| 3538 |
+
},
|
| 3539 |
+
{
|
| 3540 |
+
"epoch": 0.43712426053374737,
|
| 3541 |
+
"grad_norm": 0.15981121361255646,
|
| 3542 |
+
"learning_rate": 0.001,
|
| 3543 |
+
"loss": 2.7043,
|
| 3544 |
+
"num_input_tokens_seen": 20866662400,
|
| 3545 |
+
"step": 19900
|
| 3546 |
+
},
|
| 3547 |
+
{
|
| 3548 |
+
"epoch": 0.4382225626958925,
|
| 3549 |
+
"grad_norm": 0.15192069113254547,
|
| 3550 |
+
"learning_rate": 0.001,
|
| 3551 |
+
"loss": 2.7137,
|
| 3552 |
+
"num_input_tokens_seen": 20919091200,
|
| 3553 |
+
"step": 19950
|
| 3554 |
+
},
|
| 3555 |
+
{
|
| 3556 |
+
"epoch": 0.43932086485803756,
|
| 3557 |
+
"grad_norm": 0.14211437106132507,
|
| 3558 |
+
"learning_rate": 0.001,
|
| 3559 |
+
"loss": 2.7128,
|
| 3560 |
+
"num_input_tokens_seen": 20971520000,
|
| 3561 |
+
"step": 20000
|
| 3562 |
+
},
|
| 3563 |
+
{
|
| 3564 |
+
"epoch": 0.43932086485803756,
|
| 3565 |
+
"eval_loss": 2.611689567565918,
|
| 3566 |
+
"eval_runtime": 66.3456,
|
| 3567 |
+
"eval_samples_per_second": 75.363,
|
| 3568 |
+
"eval_steps_per_second": 18.841,
|
| 3569 |
+
"num_input_tokens_seen": 20971520000,
|
| 3570 |
+
"step": 20000
|
| 3571 |
}
|
| 3572 |
],
|
| 3573 |
"logging_steps": 50,
|
| 3574 |
"max_steps": 200000,
|
| 3575 |
+
"num_input_tokens_seen": 20971520000,
|
| 3576 |
"num_train_epochs": 5,
|
| 3577 |
"save_steps": 1000,
|
| 3578 |
"stateful_callbacks": {
|
|
|
|
| 3587 |
"attributes": {}
|
| 3588 |
}
|
| 3589 |
},
|
| 3590 |
+
"total_flos": 1.194343431929856e+19,
|
| 3591 |
"train_batch_size": 64,
|
| 3592 |
"trial_name": null,
|
| 3593 |
"trial_params": null
|