Training in progress, step 66000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:91b3f88ddbda82d579d7e857e17e157a938e94cf97682c36dea7a9e8ddcf3d14
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1773eecaec3a2d8883e5d344c33d10650e6ebcee793cb11cc46ab81989c4cf9e
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5506f8ab70fc0520e3fcff77fee663d3576573119296fd847d8ec1a26a45a3cf
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4a07bef738a41ab3ac6ef10bbe9890f379f768870bcb200cb24b86bcef1753cd
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -11578,11 +11578,189 @@
|
|
| 11578 |
"eval_steps_per_second": 23.41,
|
| 11579 |
"num_input_tokens_seen": 17039355456,
|
| 11580 |
"step": 65000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11581 |
}
|
| 11582 |
],
|
| 11583 |
"logging_steps": 50,
|
| 11584 |
"max_steps": 70000,
|
| 11585 |
-
"num_input_tokens_seen":
|
| 11586 |
"num_train_epochs": 1,
|
| 11587 |
"save_steps": 1000,
|
| 11588 |
"stateful_callbacks": {
|
|
@@ -11597,7 +11775,7 @@
|
|
| 11597 |
"attributes": {}
|
| 11598 |
}
|
| 11599 |
},
|
| 11600 |
-
"total_flos": 4.
|
| 11601 |
"train_batch_size": 64,
|
| 11602 |
"trial_name": null,
|
| 11603 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.3148216606842601,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 66000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 11578 |
"eval_steps_per_second": 23.41,
|
| 11579 |
"num_input_tokens_seen": 17039355456,
|
| 11580 |
"step": 65000
|
| 11581 |
+
},
|
| 11582 |
+
{
|
| 11583 |
+
"epoch": 0.3102901367804715,
|
| 11584 |
+
"grad_norm": 0.17116400599479675,
|
| 11585 |
+
"learning_rate": 0.000278017467984759,
|
| 11586 |
+
"loss": 2.5504,
|
| 11587 |
+
"num_input_tokens_seen": 17052462656,
|
| 11588 |
+
"step": 65050
|
| 11589 |
+
},
|
| 11590 |
+
{
|
| 11591 |
+
"epoch": 0.3105286380385657,
|
| 11592 |
+
"grad_norm": 0.17055106163024902,
|
| 11593 |
+
"learning_rate": 0.00027300475013022663,
|
| 11594 |
+
"loss": 2.543,
|
| 11595 |
+
"num_input_tokens_seen": 17065569856,
|
| 11596 |
+
"step": 65100
|
| 11597 |
+
},
|
| 11598 |
+
{
|
| 11599 |
+
"epoch": 0.3107671392966598,
|
| 11600 |
+
"grad_norm": 0.17849299311637878,
|
| 11601 |
+
"learning_rate": 0.000268020607911083,
|
| 11602 |
+
"loss": 2.5476,
|
| 11603 |
+
"num_input_tokens_seen": 17078677056,
|
| 11604 |
+
"step": 65150
|
| 11605 |
+
},
|
| 11606 |
+
{
|
| 11607 |
+
"epoch": 0.31100564055475394,
|
| 11608 |
+
"grad_norm": 0.17608341574668884,
|
| 11609 |
+
"learning_rate": 0.0002630656687635007,
|
| 11610 |
+
"loss": 2.5452,
|
| 11611 |
+
"num_input_tokens_seen": 17091784256,
|
| 11612 |
+
"step": 65200
|
| 11613 |
+
},
|
| 11614 |
+
{
|
| 11615 |
+
"epoch": 0.31124414181284804,
|
| 11616 |
+
"grad_norm": 0.19086676836013794,
|
| 11617 |
+
"learning_rate": 0.0002581405564473801,
|
| 11618 |
+
"loss": 2.5562,
|
| 11619 |
+
"num_input_tokens_seen": 17104891456,
|
| 11620 |
+
"step": 65250
|
| 11621 |
+
},
|
| 11622 |
+
{
|
| 11623 |
+
"epoch": 0.3114826430709422,
|
| 11624 |
+
"grad_norm": 0.1721603125333786,
|
| 11625 |
+
"learning_rate": 0.00025324589096782657,
|
| 11626 |
+
"loss": 2.5402,
|
| 11627 |
+
"num_input_tokens_seen": 17117998656,
|
| 11628 |
+
"step": 65300
|
| 11629 |
+
},
|
| 11630 |
+
{
|
| 11631 |
+
"epoch": 0.31172114432903636,
|
| 11632 |
+
"grad_norm": 0.16727598011493683,
|
| 11633 |
+
"learning_rate": 0.00024838228849709997,
|
| 11634 |
+
"loss": 2.5253,
|
| 11635 |
+
"num_input_tokens_seen": 17131105856,
|
| 11636 |
+
"step": 65350
|
| 11637 |
+
},
|
| 11638 |
+
{
|
| 11639 |
+
"epoch": 0.31195964558713046,
|
| 11640 |
+
"grad_norm": 0.1664544939994812,
|
| 11641 |
+
"learning_rate": 0.000243550361297047,
|
| 11642 |
+
"loss": 2.5519,
|
| 11643 |
+
"num_input_tokens_seen": 17144213056,
|
| 11644 |
+
"step": 65400
|
| 11645 |
+
},
|
| 11646 |
+
{
|
| 11647 |
+
"epoch": 0.3121981468452246,
|
| 11648 |
+
"grad_norm": 0.17195752263069153,
|
| 11649 |
+
"learning_rate": 0.00023875071764202561,
|
| 11650 |
+
"loss": 2.5297,
|
| 11651 |
+
"num_input_tokens_seen": 17157320256,
|
| 11652 |
+
"step": 65450
|
| 11653 |
+
},
|
| 11654 |
+
{
|
| 11655 |
+
"epoch": 0.3124366481033187,
|
| 11656 |
+
"grad_norm": 0.19001176953315735,
|
| 11657 |
+
"learning_rate": 0.00023398396174233177,
|
| 11658 |
+
"loss": 2.5439,
|
| 11659 |
+
"num_input_tokens_seen": 17170427456,
|
| 11660 |
+
"step": 65500
|
| 11661 |
+
},
|
| 11662 |
+
{
|
| 11663 |
+
"epoch": 0.3124366481033187,
|
| 11664 |
+
"eval_loss": 2.426327705383301,
|
| 11665 |
+
"eval_runtime": 53.7603,
|
| 11666 |
+
"eval_samples_per_second": 93.005,
|
| 11667 |
+
"eval_steps_per_second": 23.251,
|
| 11668 |
+
"num_input_tokens_seen": 17170427456,
|
| 11669 |
+
"step": 65500
|
| 11670 |
+
},
|
| 11671 |
+
{
|
| 11672 |
+
"epoch": 0.3126751493614129,
|
| 11673 |
+
"grad_norm": 0.17215538024902344,
|
| 11674 |
+
"learning_rate": 0.00022925069366813716,
|
| 11675 |
+
"loss": 2.5442,
|
| 11676 |
+
"num_input_tokens_seen": 17183534656,
|
| 11677 |
+
"step": 65550
|
| 11678 |
+
},
|
| 11679 |
+
{
|
| 11680 |
+
"epoch": 0.31291365061950704,
|
| 11681 |
+
"grad_norm": 0.16736114025115967,
|
| 11682 |
+
"learning_rate": 0.0002245515092739488,
|
| 11683 |
+
"loss": 2.5472,
|
| 11684 |
+
"num_input_tokens_seen": 17196641856,
|
| 11685 |
+
"step": 65600
|
| 11686 |
+
},
|
| 11687 |
+
{
|
| 11688 |
+
"epoch": 0.31315215187760115,
|
| 11689 |
+
"grad_norm": 0.1739792823791504,
|
| 11690 |
+
"learning_rate": 0.00021988700012359863,
|
| 11691 |
+
"loss": 2.5401,
|
| 11692 |
+
"num_input_tokens_seen": 17209749056,
|
| 11693 |
+
"step": 65650
|
| 11694 |
+
},
|
| 11695 |
+
{
|
| 11696 |
+
"epoch": 0.3133906531356953,
|
| 11697 |
+
"grad_norm": 0.17363224923610687,
|
| 11698 |
+
"learning_rate": 0.00021525775341577403,
|
| 11699 |
+
"loss": 2.5539,
|
| 11700 |
+
"num_input_tokens_seen": 17222856256,
|
| 11701 |
+
"step": 65700
|
| 11702 |
+
},
|
| 11703 |
+
{
|
| 11704 |
+
"epoch": 0.3136291543937894,
|
| 11705 |
+
"grad_norm": 0.16787610948085785,
|
| 11706 |
+
"learning_rate": 0.00021066435191009715,
|
| 11707 |
+
"loss": 2.5338,
|
| 11708 |
+
"num_input_tokens_seen": 17235963456,
|
| 11709 |
+
"step": 65750
|
| 11710 |
+
},
|
| 11711 |
+
{
|
| 11712 |
+
"epoch": 0.31386765565188357,
|
| 11713 |
+
"grad_norm": 0.17158125340938568,
|
| 11714 |
+
"learning_rate": 0.00020610737385376348,
|
| 11715 |
+
"loss": 2.5531,
|
| 11716 |
+
"num_input_tokens_seen": 17249070656,
|
| 11717 |
+
"step": 65800
|
| 11718 |
+
},
|
| 11719 |
+
{
|
| 11720 |
+
"epoch": 0.3141061569099777,
|
| 11721 |
+
"grad_norm": 0.1693524569272995,
|
| 11722 |
+
"learning_rate": 0.00020158739290874821,
|
| 11723 |
+
"loss": 2.5286,
|
| 11724 |
+
"num_input_tokens_seen": 17262177856,
|
| 11725 |
+
"step": 65850
|
| 11726 |
+
},
|
| 11727 |
+
{
|
| 11728 |
+
"epoch": 0.31434465816807183,
|
| 11729 |
+
"grad_norm": 0.1730414181947708,
|
| 11730 |
+
"learning_rate": 0.0001971049780795901,
|
| 11731 |
+
"loss": 2.5228,
|
| 11732 |
+
"num_input_tokens_seen": 17275285056,
|
| 11733 |
+
"step": 65900
|
| 11734 |
+
},
|
| 11735 |
+
{
|
| 11736 |
+
"epoch": 0.314583159426166,
|
| 11737 |
+
"grad_norm": 0.16220349073410034,
|
| 11738 |
+
"learning_rate": 0.00019266069364176142,
|
| 11739 |
+
"loss": 2.5445,
|
| 11740 |
+
"num_input_tokens_seen": 17288392256,
|
| 11741 |
+
"step": 65950
|
| 11742 |
+
},
|
| 11743 |
+
{
|
| 11744 |
+
"epoch": 0.3148216606842601,
|
| 11745 |
+
"grad_norm": 0.1605050265789032,
|
| 11746 |
+
"learning_rate": 0.00018825509907063325,
|
| 11747 |
+
"loss": 2.5491,
|
| 11748 |
+
"num_input_tokens_seen": 17301499456,
|
| 11749 |
+
"step": 66000
|
| 11750 |
+
},
|
| 11751 |
+
{
|
| 11752 |
+
"epoch": 0.3148216606842601,
|
| 11753 |
+
"eval_loss": 2.4224469661712646,
|
| 11754 |
+
"eval_runtime": 53.2989,
|
| 11755 |
+
"eval_samples_per_second": 93.811,
|
| 11756 |
+
"eval_steps_per_second": 23.453,
|
| 11757 |
+
"num_input_tokens_seen": 17301499456,
|
| 11758 |
+
"step": 66000
|
| 11759 |
}
|
| 11760 |
],
|
| 11761 |
"logging_steps": 50,
|
| 11762 |
"max_steps": 70000,
|
| 11763 |
+
"num_input_tokens_seen": 17301499456,
|
| 11764 |
"num_train_epochs": 1,
|
| 11765 |
"save_steps": 1000,
|
| 11766 |
"stateful_callbacks": {
|
|
|
|
| 11775 |
"attributes": {}
|
| 11776 |
}
|
| 11777 |
},
|
| 11778 |
+
"total_flos": 4.628319967114691e+18,
|
| 11779 |
"train_batch_size": 64,
|
| 11780 |
"trial_name": null,
|
| 11781 |
"trial_params": null
|