Training in progress, step 43000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 563074920
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8f40af4f458fcf98ff975a8a67bf6d8f825776f93c4fc893bfff9e777a429186
|
| 3 |
size 563074920
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1125916346
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2806be43a2dbc11749a984f9f27c3c727021a3459f016ea43cf735de07b8e8b
|
| 3 |
size 1125916346
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4616c29adbb72fca86a53186f80355f9390c75c85ef3660d2db8c34d983194a4
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:baf398bf7bc1350249be7408612b67c8ebc4068beabac28de92ab798dacce92e
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -7484,11 +7484,189 @@
|
|
| 7484 |
"eval_steps_per_second": 8.724,
|
| 7485 |
"num_input_tokens_seen": 11010048000,
|
| 7486 |
"step": 42000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7487 |
}
|
| 7488 |
],
|
| 7489 |
"logging_steps": 50,
|
| 7490 |
"max_steps": 60000,
|
| 7491 |
-
"num_input_tokens_seen":
|
| 7492 |
"num_train_epochs": 1,
|
| 7493 |
"save_steps": 1000,
|
| 7494 |
"stateful_callbacks": {
|
|
@@ -7503,7 +7681,7 @@
|
|
| 7503 |
"attributes": {}
|
| 7504 |
}
|
| 7505 |
},
|
| 7506 |
-
"total_flos": 7.
|
| 7507 |
"train_batch_size": 64,
|
| 7508 |
"trial_name": null,
|
| 7509 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.2892394515885468,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 43000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 7484 |
"eval_steps_per_second": 8.724,
|
| 7485 |
"num_input_tokens_seen": 11010048000,
|
| 7486 |
"step": 42000
|
| 7487 |
+
},
|
| 7488 |
+
{
|
| 7489 |
+
"epoch": 0.28284927765810214,
|
| 7490 |
+
"grad_norm": 0.6817448735237122,
|
| 7491 |
+
"learning_rate": 0.0005871683190370497,
|
| 7492 |
+
"loss": 12.0507,
|
| 7493 |
+
"num_input_tokens_seen": 11023155200,
|
| 7494 |
+
"step": 42050
|
| 7495 |
+
},
|
| 7496 |
+
{
|
| 7497 |
+
"epoch": 0.28318560260180975,
|
| 7498 |
+
"grad_norm": 1.443415641784668,
|
| 7499 |
+
"learning_rate": 0.0005864706724332221,
|
| 7500 |
+
"loss": 12.0804,
|
| 7501 |
+
"num_input_tokens_seen": 11036262400,
|
| 7502 |
+
"step": 42100
|
| 7503 |
+
},
|
| 7504 |
+
{
|
| 7505 |
+
"epoch": 0.28352192754551736,
|
| 7506 |
+
"grad_norm": 0.7497735619544983,
|
| 7507 |
+
"learning_rate": 0.0005857549961807582,
|
| 7508 |
+
"loss": 12.1135,
|
| 7509 |
+
"num_input_tokens_seen": 11049369600,
|
| 7510 |
+
"step": 42150
|
| 7511 |
+
},
|
| 7512 |
+
{
|
| 7513 |
+
"epoch": 0.28385825248922497,
|
| 7514 |
+
"grad_norm": 0.7141171097755432,
|
| 7515 |
+
"learning_rate": 0.0005850213353222835,
|
| 7516 |
+
"loss": 12.0707,
|
| 7517 |
+
"num_input_tokens_seen": 11062476800,
|
| 7518 |
+
"step": 42200
|
| 7519 |
+
},
|
| 7520 |
+
{
|
| 7521 |
+
"epoch": 0.2841945774329326,
|
| 7522 |
+
"grad_norm": 0.6800997257232666,
|
| 7523 |
+
"learning_rate": 0.0005842697360323246,
|
| 7524 |
+
"loss": 12.0946,
|
| 7525 |
+
"num_input_tokens_seen": 11075584000,
|
| 7526 |
+
"step": 42250
|
| 7527 |
+
},
|
| 7528 |
+
{
|
| 7529 |
+
"epoch": 0.2845309023766402,
|
| 7530 |
+
"grad_norm": 0.6729973554611206,
|
| 7531 |
+
"learning_rate": 0.0005835002456144005,
|
| 7532 |
+
"loss": 12.0882,
|
| 7533 |
+
"num_input_tokens_seen": 11088691200,
|
| 7534 |
+
"step": 42300
|
| 7535 |
+
},
|
| 7536 |
+
{
|
| 7537 |
+
"epoch": 0.2848672273203478,
|
| 7538 |
+
"grad_norm": 0.715886116027832,
|
| 7539 |
+
"learning_rate": 0.0005827129124980481,
|
| 7540 |
+
"loss": 12.0713,
|
| 7541 |
+
"num_input_tokens_seen": 11101798400,
|
| 7542 |
+
"step": 42350
|
| 7543 |
+
},
|
| 7544 |
+
{
|
| 7545 |
+
"epoch": 0.2852035522640554,
|
| 7546 |
+
"grad_norm": 0.7392980456352234,
|
| 7547 |
+
"learning_rate": 0.0005819077862357724,
|
| 7548 |
+
"loss": 12.0934,
|
| 7549 |
+
"num_input_tokens_seen": 11114905600,
|
| 7550 |
+
"step": 42400
|
| 7551 |
+
},
|
| 7552 |
+
{
|
| 7553 |
+
"epoch": 0.285539877207763,
|
| 7554 |
+
"grad_norm": 0.7118540406227112,
|
| 7555 |
+
"learning_rate": 0.0005810849174999285,
|
| 7556 |
+
"loss": 12.0531,
|
| 7557 |
+
"num_input_tokens_seen": 11128012800,
|
| 7558 |
+
"step": 42450
|
| 7559 |
+
},
|
| 7560 |
+
{
|
| 7561 |
+
"epoch": 0.28587620215147064,
|
| 7562 |
+
"grad_norm": 0.6643871665000916,
|
| 7563 |
+
"learning_rate": 0.000580244358079532,
|
| 7564 |
+
"loss": 12.0812,
|
| 7565 |
+
"num_input_tokens_seen": 11141120000,
|
| 7566 |
+
"step": 42500
|
| 7567 |
+
},
|
| 7568 |
+
{
|
| 7569 |
+
"epoch": 0.28587620215147064,
|
| 7570 |
+
"eval_loss": 2.9250741004943848,
|
| 7571 |
+
"eval_runtime": 143.6479,
|
| 7572 |
+
"eval_samples_per_second": 34.807,
|
| 7573 |
+
"eval_steps_per_second": 8.702,
|
| 7574 |
+
"num_input_tokens_seen": 11141120000,
|
| 7575 |
+
"step": 42500
|
| 7576 |
+
},
|
| 7577 |
+
{
|
| 7578 |
+
"epoch": 0.2862125270951783,
|
| 7579 |
+
"grad_norm": 0.7261589169502258,
|
| 7580 |
+
"learning_rate": 0.0005793861608770001,
|
| 7581 |
+
"loss": 12.0856,
|
| 7582 |
+
"num_input_tokens_seen": 11154227200,
|
| 7583 |
+
"step": 42550
|
| 7584 |
+
},
|
| 7585 |
+
{
|
| 7586 |
+
"epoch": 0.2865488520388859,
|
| 7587 |
+
"grad_norm": 0.7352684140205383,
|
| 7588 |
+
"learning_rate": 0.0005785103799048218,
|
| 7589 |
+
"loss": 12.094,
|
| 7590 |
+
"num_input_tokens_seen": 11167334400,
|
| 7591 |
+
"step": 42600
|
| 7592 |
+
},
|
| 7593 |
+
{
|
| 7594 |
+
"epoch": 0.2868851769825935,
|
| 7595 |
+
"grad_norm": 0.650610089302063,
|
| 7596 |
+
"learning_rate": 0.0005776170702821582,
|
| 7597 |
+
"loss": 12.0796,
|
| 7598 |
+
"num_input_tokens_seen": 11180441600,
|
| 7599 |
+
"step": 42650
|
| 7600 |
+
},
|
| 7601 |
+
{
|
| 7602 |
+
"epoch": 0.28722150192630114,
|
| 7603 |
+
"grad_norm": 0.6917529106140137,
|
| 7604 |
+
"learning_rate": 0.0005767062882313743,
|
| 7605 |
+
"loss": 12.0511,
|
| 7606 |
+
"num_input_tokens_seen": 11193548800,
|
| 7607 |
+
"step": 42700
|
| 7608 |
+
},
|
| 7609 |
+
{
|
| 7610 |
+
"epoch": 0.28755782687000875,
|
| 7611 |
+
"grad_norm": 0.8611562252044678,
|
| 7612 |
+
"learning_rate": 0.0005757780910744997,
|
| 7613 |
+
"loss": 12.0772,
|
| 7614 |
+
"num_input_tokens_seen": 11206656000,
|
| 7615 |
+
"step": 42750
|
| 7616 |
+
},
|
| 7617 |
+
{
|
| 7618 |
+
"epoch": 0.28789415181371636,
|
| 7619 |
+
"grad_norm": 0.7321364283561707,
|
| 7620 |
+
"learning_rate": 0.0005748325372296208,
|
| 7621 |
+
"loss": 12.0432,
|
| 7622 |
+
"num_input_tokens_seen": 11219763200,
|
| 7623 |
+
"step": 42800
|
| 7624 |
+
},
|
| 7625 |
+
{
|
| 7626 |
+
"epoch": 0.28823047675742397,
|
| 7627 |
+
"grad_norm": 0.6974388957023621,
|
| 7628 |
+
"learning_rate": 0.0005738696862072053,
|
| 7629 |
+
"loss": 12.0408,
|
| 7630 |
+
"num_input_tokens_seen": 11232870400,
|
| 7631 |
+
"step": 42850
|
| 7632 |
+
},
|
| 7633 |
+
{
|
| 7634 |
+
"epoch": 0.2885668017011316,
|
| 7635 |
+
"grad_norm": 0.6981905102729797,
|
| 7636 |
+
"learning_rate": 0.0005728895986063554,
|
| 7637 |
+
"loss": 12.0419,
|
| 7638 |
+
"num_input_tokens_seen": 11245977600,
|
| 7639 |
+
"step": 42900
|
| 7640 |
+
},
|
| 7641 |
+
{
|
| 7642 |
+
"epoch": 0.2889031266448392,
|
| 7643 |
+
"grad_norm": 0.7019402384757996,
|
| 7644 |
+
"learning_rate": 0.000571892336110995,
|
| 7645 |
+
"loss": 12.0206,
|
| 7646 |
+
"num_input_tokens_seen": 11259084800,
|
| 7647 |
+
"step": 42950
|
| 7648 |
+
},
|
| 7649 |
+
{
|
| 7650 |
+
"epoch": 0.2892394515885468,
|
| 7651 |
+
"grad_norm": 0.7176699042320251,
|
| 7652 |
+
"learning_rate": 0.0005708779614859863,
|
| 7653 |
+
"loss": 12.0641,
|
| 7654 |
+
"num_input_tokens_seen": 11272192000,
|
| 7655 |
+
"step": 43000
|
| 7656 |
+
},
|
| 7657 |
+
{
|
| 7658 |
+
"epoch": 0.2892394515885468,
|
| 7659 |
+
"eval_loss": 2.9219655990600586,
|
| 7660 |
+
"eval_runtime": 144.3813,
|
| 7661 |
+
"eval_samples_per_second": 34.631,
|
| 7662 |
+
"eval_steps_per_second": 8.658,
|
| 7663 |
+
"num_input_tokens_seen": 11272192000,
|
| 7664 |
+
"step": 43000
|
| 7665 |
}
|
| 7666 |
],
|
| 7667 |
"logging_steps": 50,
|
| 7668 |
"max_steps": 60000,
|
| 7669 |
+
"num_input_tokens_seen": 11272192000,
|
| 7670 |
"num_train_epochs": 1,
|
| 7671 |
"save_steps": 1000,
|
| 7672 |
"stateful_callbacks": {
|
|
|
|
| 7681 |
"attributes": {}
|
| 7682 |
}
|
| 7683 |
},
|
| 7684 |
+
"total_flos": 7.18283552587776e+18,
|
| 7685 |
"train_batch_size": 64,
|
| 7686 |
"trial_name": null,
|
| 7687 |
"trial_params": null
|