Training in progress, step 2600, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 257609792
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:80575a517640befeb13d8a45ac64f043e3f42763e6b44469f20fcd33684343d7
|
| 3 |
size 257609792
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 515278091
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5ccd6a8136d4f05e4f1eceb89bbc99924cd37d33d944767e7fa5a2668db41ad0
|
| 3 |
size 515278091
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a4f88a3ed70692d8f05b97617079ec0b41dc17b927a833bcbaa62616274bebe6
|
| 3 |
size 14645
|
last-checkpoint/scaler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1383
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aea2b6a5675bb9dc7d6d847844f168cbc539a3493d586a8e2634d29c173b0f88
|
| 3 |
size 1383
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64cd2b31c96e17fa70d4680796990915078e28651e7693ec8503c4c01869ff59
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -3400,6 +3400,294 @@
|
|
| 3400 |
"learning_rate": 7.00125e-05,
|
| 3401 |
"loss": 4.224,
|
| 3402 |
"step": 2400
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3403 |
}
|
| 3404 |
],
|
| 3405 |
"logging_steps": 5,
|
|
@@ -3419,7 +3707,7 @@
|
|
| 3419 |
"attributes": {}
|
| 3420 |
}
|
| 3421 |
},
|
| 3422 |
-
"total_flos": 2.
|
| 3423 |
"train_batch_size": 8,
|
| 3424 |
"trial_name": null,
|
| 3425 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.7995079950799509,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 2600,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 3400 |
"learning_rate": 7.00125e-05,
|
| 3401 |
"loss": 4.224,
|
| 3402 |
"step": 2400
|
| 3403 |
+
},
|
| 3404 |
+
{
|
| 3405 |
+
"epoch": 0.7395448954489545,
|
| 3406 |
+
"grad_norm": 0.35414189100265503,
|
| 3407 |
+
"learning_rate": 6.995e-05,
|
| 3408 |
+
"loss": 4.267,
|
| 3409 |
+
"step": 2405
|
| 3410 |
+
},
|
| 3411 |
+
{
|
| 3412 |
+
"epoch": 0.7410824108241082,
|
| 3413 |
+
"grad_norm": 0.3554304838180542,
|
| 3414 |
+
"learning_rate": 6.988750000000001e-05,
|
| 3415 |
+
"loss": 4.1949,
|
| 3416 |
+
"step": 2410
|
| 3417 |
+
},
|
| 3418 |
+
{
|
| 3419 |
+
"epoch": 0.742619926199262,
|
| 3420 |
+
"grad_norm": 0.3534418046474457,
|
| 3421 |
+
"learning_rate": 6.9825e-05,
|
| 3422 |
+
"loss": 4.2337,
|
| 3423 |
+
"step": 2415
|
| 3424 |
+
},
|
| 3425 |
+
{
|
| 3426 |
+
"epoch": 0.7441574415744158,
|
| 3427 |
+
"grad_norm": 0.36624109745025635,
|
| 3428 |
+
"learning_rate": 6.976250000000001e-05,
|
| 3429 |
+
"loss": 4.2385,
|
| 3430 |
+
"step": 2420
|
| 3431 |
+
},
|
| 3432 |
+
{
|
| 3433 |
+
"epoch": 0.7456949569495694,
|
| 3434 |
+
"grad_norm": 0.3525283932685852,
|
| 3435 |
+
"learning_rate": 6.97e-05,
|
| 3436 |
+
"loss": 4.2799,
|
| 3437 |
+
"step": 2425
|
| 3438 |
+
},
|
| 3439 |
+
{
|
| 3440 |
+
"epoch": 0.7472324723247232,
|
| 3441 |
+
"grad_norm": 0.3583906590938568,
|
| 3442 |
+
"learning_rate": 6.96375e-05,
|
| 3443 |
+
"loss": 4.1917,
|
| 3444 |
+
"step": 2430
|
| 3445 |
+
},
|
| 3446 |
+
{
|
| 3447 |
+
"epoch": 0.748769987699877,
|
| 3448 |
+
"grad_norm": 0.355895459651947,
|
| 3449 |
+
"learning_rate": 6.9575e-05,
|
| 3450 |
+
"loss": 4.2309,
|
| 3451 |
+
"step": 2435
|
| 3452 |
+
},
|
| 3453 |
+
{
|
| 3454 |
+
"epoch": 0.7503075030750308,
|
| 3455 |
+
"grad_norm": 0.3442673981189728,
|
| 3456 |
+
"learning_rate": 6.95125e-05,
|
| 3457 |
+
"loss": 4.2104,
|
| 3458 |
+
"step": 2440
|
| 3459 |
+
},
|
| 3460 |
+
{
|
| 3461 |
+
"epoch": 0.7518450184501845,
|
| 3462 |
+
"grad_norm": 0.35169875621795654,
|
| 3463 |
+
"learning_rate": 6.945000000000001e-05,
|
| 3464 |
+
"loss": 4.2586,
|
| 3465 |
+
"step": 2445
|
| 3466 |
+
},
|
| 3467 |
+
{
|
| 3468 |
+
"epoch": 0.7533825338253383,
|
| 3469 |
+
"grad_norm": 0.36030516028404236,
|
| 3470 |
+
"learning_rate": 6.93875e-05,
|
| 3471 |
+
"loss": 4.2897,
|
| 3472 |
+
"step": 2450
|
| 3473 |
+
},
|
| 3474 |
+
{
|
| 3475 |
+
"epoch": 0.754920049200492,
|
| 3476 |
+
"grad_norm": 0.3696916997432709,
|
| 3477 |
+
"learning_rate": 6.9325e-05,
|
| 3478 |
+
"loss": 4.2314,
|
| 3479 |
+
"step": 2455
|
| 3480 |
+
},
|
| 3481 |
+
{
|
| 3482 |
+
"epoch": 0.7564575645756457,
|
| 3483 |
+
"grad_norm": 0.3628195822238922,
|
| 3484 |
+
"learning_rate": 6.926250000000001e-05,
|
| 3485 |
+
"loss": 4.1903,
|
| 3486 |
+
"step": 2460
|
| 3487 |
+
},
|
| 3488 |
+
{
|
| 3489 |
+
"epoch": 0.7579950799507995,
|
| 3490 |
+
"grad_norm": 0.37186235189437866,
|
| 3491 |
+
"learning_rate": 6.92e-05,
|
| 3492 |
+
"loss": 4.2523,
|
| 3493 |
+
"step": 2465
|
| 3494 |
+
},
|
| 3495 |
+
{
|
| 3496 |
+
"epoch": 0.7595325953259533,
|
| 3497 |
+
"grad_norm": 0.35027140378952026,
|
| 3498 |
+
"learning_rate": 6.91375e-05,
|
| 3499 |
+
"loss": 4.2943,
|
| 3500 |
+
"step": 2470
|
| 3501 |
+
},
|
| 3502 |
+
{
|
| 3503 |
+
"epoch": 0.761070110701107,
|
| 3504 |
+
"grad_norm": 0.3844810128211975,
|
| 3505 |
+
"learning_rate": 6.9075e-05,
|
| 3506 |
+
"loss": 4.2552,
|
| 3507 |
+
"step": 2475
|
| 3508 |
+
},
|
| 3509 |
+
{
|
| 3510 |
+
"epoch": 0.7626076260762608,
|
| 3511 |
+
"grad_norm": 0.35497698187828064,
|
| 3512 |
+
"learning_rate": 6.90125e-05,
|
| 3513 |
+
"loss": 4.2048,
|
| 3514 |
+
"step": 2480
|
| 3515 |
+
},
|
| 3516 |
+
{
|
| 3517 |
+
"epoch": 0.7641451414514145,
|
| 3518 |
+
"grad_norm": 0.35539621114730835,
|
| 3519 |
+
"learning_rate": 6.895000000000001e-05,
|
| 3520 |
+
"loss": 4.3195,
|
| 3521 |
+
"step": 2485
|
| 3522 |
+
},
|
| 3523 |
+
{
|
| 3524 |
+
"epoch": 0.7656826568265682,
|
| 3525 |
+
"grad_norm": 0.36448633670806885,
|
| 3526 |
+
"learning_rate": 6.88875e-05,
|
| 3527 |
+
"loss": 4.2099,
|
| 3528 |
+
"step": 2490
|
| 3529 |
+
},
|
| 3530 |
+
{
|
| 3531 |
+
"epoch": 0.767220172201722,
|
| 3532 |
+
"grad_norm": 0.3572072982788086,
|
| 3533 |
+
"learning_rate": 6.8825e-05,
|
| 3534 |
+
"loss": 4.2637,
|
| 3535 |
+
"step": 2495
|
| 3536 |
+
},
|
| 3537 |
+
{
|
| 3538 |
+
"epoch": 0.7687576875768758,
|
| 3539 |
+
"grad_norm": 0.3543466031551361,
|
| 3540 |
+
"learning_rate": 6.876250000000001e-05,
|
| 3541 |
+
"loss": 4.2106,
|
| 3542 |
+
"step": 2500
|
| 3543 |
+
},
|
| 3544 |
+
{
|
| 3545 |
+
"epoch": 0.7687576875768758,
|
| 3546 |
+
"eval_loss": 4.25692892074585,
|
| 3547 |
+
"eval_runtime": 15.9135,
|
| 3548 |
+
"eval_samples_per_second": 62.84,
|
| 3549 |
+
"eval_steps_per_second": 3.959,
|
| 3550 |
+
"step": 2500
|
| 3551 |
+
},
|
| 3552 |
+
{
|
| 3553 |
+
"epoch": 0.7702952029520295,
|
| 3554 |
+
"grad_norm": 0.370313823223114,
|
| 3555 |
+
"learning_rate": 6.87e-05,
|
| 3556 |
+
"loss": 4.3354,
|
| 3557 |
+
"step": 2505
|
| 3558 |
+
},
|
| 3559 |
+
{
|
| 3560 |
+
"epoch": 0.7718327183271833,
|
| 3561 |
+
"grad_norm": 0.3540562391281128,
|
| 3562 |
+
"learning_rate": 6.86375e-05,
|
| 3563 |
+
"loss": 4.2414,
|
| 3564 |
+
"step": 2510
|
| 3565 |
+
},
|
| 3566 |
+
{
|
| 3567 |
+
"epoch": 0.773370233702337,
|
| 3568 |
+
"grad_norm": 0.3575718402862549,
|
| 3569 |
+
"learning_rate": 6.8575e-05,
|
| 3570 |
+
"loss": 4.1738,
|
| 3571 |
+
"step": 2515
|
| 3572 |
+
},
|
| 3573 |
+
{
|
| 3574 |
+
"epoch": 0.7749077490774908,
|
| 3575 |
+
"grad_norm": 0.36341428756713867,
|
| 3576 |
+
"learning_rate": 6.85125e-05,
|
| 3577 |
+
"loss": 4.2806,
|
| 3578 |
+
"step": 2520
|
| 3579 |
+
},
|
| 3580 |
+
{
|
| 3581 |
+
"epoch": 0.7764452644526445,
|
| 3582 |
+
"grad_norm": 0.3755071759223938,
|
| 3583 |
+
"learning_rate": 6.845e-05,
|
| 3584 |
+
"loss": 4.2433,
|
| 3585 |
+
"step": 2525
|
| 3586 |
+
},
|
| 3587 |
+
{
|
| 3588 |
+
"epoch": 0.7779827798277983,
|
| 3589 |
+
"grad_norm": 0.3563622236251831,
|
| 3590 |
+
"learning_rate": 6.83875e-05,
|
| 3591 |
+
"loss": 4.2373,
|
| 3592 |
+
"step": 2530
|
| 3593 |
+
},
|
| 3594 |
+
{
|
| 3595 |
+
"epoch": 0.7795202952029521,
|
| 3596 |
+
"grad_norm": 0.3693353235721588,
|
| 3597 |
+
"learning_rate": 6.832500000000001e-05,
|
| 3598 |
+
"loss": 4.2185,
|
| 3599 |
+
"step": 2535
|
| 3600 |
+
},
|
| 3601 |
+
{
|
| 3602 |
+
"epoch": 0.7810578105781057,
|
| 3603 |
+
"grad_norm": 0.3789558708667755,
|
| 3604 |
+
"learning_rate": 6.826250000000001e-05,
|
| 3605 |
+
"loss": 4.2421,
|
| 3606 |
+
"step": 2540
|
| 3607 |
+
},
|
| 3608 |
+
{
|
| 3609 |
+
"epoch": 0.7825953259532595,
|
| 3610 |
+
"grad_norm": 0.36780837178230286,
|
| 3611 |
+
"learning_rate": 6.82e-05,
|
| 3612 |
+
"loss": 4.258,
|
| 3613 |
+
"step": 2545
|
| 3614 |
+
},
|
| 3615 |
+
{
|
| 3616 |
+
"epoch": 0.7841328413284133,
|
| 3617 |
+
"grad_norm": 0.3676084280014038,
|
| 3618 |
+
"learning_rate": 6.81375e-05,
|
| 3619 |
+
"loss": 4.3125,
|
| 3620 |
+
"step": 2550
|
| 3621 |
+
},
|
| 3622 |
+
{
|
| 3623 |
+
"epoch": 0.785670356703567,
|
| 3624 |
+
"grad_norm": 0.3575945496559143,
|
| 3625 |
+
"learning_rate": 6.8075e-05,
|
| 3626 |
+
"loss": 4.2462,
|
| 3627 |
+
"step": 2555
|
| 3628 |
+
},
|
| 3629 |
+
{
|
| 3630 |
+
"epoch": 0.7872078720787208,
|
| 3631 |
+
"grad_norm": 0.36073750257492065,
|
| 3632 |
+
"learning_rate": 6.80125e-05,
|
| 3633 |
+
"loss": 4.1678,
|
| 3634 |
+
"step": 2560
|
| 3635 |
+
},
|
| 3636 |
+
{
|
| 3637 |
+
"epoch": 0.7887453874538746,
|
| 3638 |
+
"grad_norm": 0.35818690061569214,
|
| 3639 |
+
"learning_rate": 6.795e-05,
|
| 3640 |
+
"loss": 4.3244,
|
| 3641 |
+
"step": 2565
|
| 3642 |
+
},
|
| 3643 |
+
{
|
| 3644 |
+
"epoch": 0.7902829028290282,
|
| 3645 |
+
"grad_norm": 0.353287935256958,
|
| 3646 |
+
"learning_rate": 6.78875e-05,
|
| 3647 |
+
"loss": 4.2217,
|
| 3648 |
+
"step": 2570
|
| 3649 |
+
},
|
| 3650 |
+
{
|
| 3651 |
+
"epoch": 0.791820418204182,
|
| 3652 |
+
"grad_norm": 0.3640352785587311,
|
| 3653 |
+
"learning_rate": 6.782500000000001e-05,
|
| 3654 |
+
"loss": 4.2434,
|
| 3655 |
+
"step": 2575
|
| 3656 |
+
},
|
| 3657 |
+
{
|
| 3658 |
+
"epoch": 0.7933579335793358,
|
| 3659 |
+
"grad_norm": 0.37529149651527405,
|
| 3660 |
+
"learning_rate": 6.77625e-05,
|
| 3661 |
+
"loss": 4.2322,
|
| 3662 |
+
"step": 2580
|
| 3663 |
+
},
|
| 3664 |
+
{
|
| 3665 |
+
"epoch": 0.7948954489544895,
|
| 3666 |
+
"grad_norm": 0.3544490337371826,
|
| 3667 |
+
"learning_rate": 6.77e-05,
|
| 3668 |
+
"loss": 4.2592,
|
| 3669 |
+
"step": 2585
|
| 3670 |
+
},
|
| 3671 |
+
{
|
| 3672 |
+
"epoch": 0.7964329643296433,
|
| 3673 |
+
"grad_norm": 0.3656282424926758,
|
| 3674 |
+
"learning_rate": 6.76375e-05,
|
| 3675 |
+
"loss": 4.2829,
|
| 3676 |
+
"step": 2590
|
| 3677 |
+
},
|
| 3678 |
+
{
|
| 3679 |
+
"epoch": 0.7979704797047971,
|
| 3680 |
+
"grad_norm": 0.3591875731945038,
|
| 3681 |
+
"learning_rate": 6.7575e-05,
|
| 3682 |
+
"loss": 4.3061,
|
| 3683 |
+
"step": 2595
|
| 3684 |
+
},
|
| 3685 |
+
{
|
| 3686 |
+
"epoch": 0.7995079950799509,
|
| 3687 |
+
"grad_norm": 0.3518073260784149,
|
| 3688 |
+
"learning_rate": 6.75125e-05,
|
| 3689 |
+
"loss": 4.2296,
|
| 3690 |
+
"step": 2600
|
| 3691 |
}
|
| 3692 |
],
|
| 3693 |
"logging_steps": 5,
|
|
|
|
| 3707 |
"attributes": {}
|
| 3708 |
}
|
| 3709 |
},
|
| 3710 |
+
"total_flos": 2.45447918616576e+16,
|
| 3711 |
"train_batch_size": 8,
|
| 3712 |
"trial_name": null,
|
| 3713 |
"trial_params": null
|