Training in progress, step 128000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d8c3d49b98bfff4ce201de8fd57e1cb46f198541be8429619ddab0ad9d2161b3
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:29a7f0ec5937d8a36844081698ed35de214589f8d2c33900c6101538c2a4386f
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:065d5f05cf0a782fa5b97e409b16ef2b4cf8c6102c4a9437ad899a13c927398f
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a9d00381a4191263086f00c86313941ab13504158fdcedfcfacd5f658d7b3729
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -22614,11 +22614,189 @@
|
|
| 22614 |
"eval_steps_per_second": 15.182,
|
| 22615 |
"num_input_tokens_seen": 66573856704,
|
| 22616 |
"step": 127000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22617 |
}
|
| 22618 |
],
|
| 22619 |
"logging_steps": 50,
|
| 22620 |
"max_steps": 140000,
|
| 22621 |
-
"num_input_tokens_seen":
|
| 22622 |
"num_train_epochs": 2,
|
| 22623 |
"save_steps": 1000,
|
| 22624 |
"stateful_callbacks": {
|
|
@@ -22633,7 +22811,7 @@
|
|
| 22633 |
"attributes": {}
|
| 22634 |
}
|
| 22635 |
},
|
| 22636 |
-
"total_flos": 1.
|
| 22637 |
"train_batch_size": 32,
|
| 22638 |
"trial_name": null,
|
| 22639 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.2211288264545597,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 128000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 22614 |
"eval_steps_per_second": 15.182,
|
| 22615 |
"num_input_tokens_seen": 66573856704,
|
| 22616 |
"step": 127000
|
| 22617 |
+
},
|
| 22618 |
+
{
|
| 22619 |
+
"epoch": 1.2120657786469824,
|
| 22620 |
+
"grad_norm": 0.13154049217700958,
|
| 22621 |
+
"learning_rate": 0.00044123130127108126,
|
| 22622 |
+
"loss": 2.0525,
|
| 22623 |
+
"num_input_tokens_seen": 66600067712,
|
| 22624 |
+
"step": 127050
|
| 22625 |
+
},
|
| 22626 |
+
{
|
| 22627 |
+
"epoch": 1.2125427811631706,
|
| 22628 |
+
"grad_norm": 0.13129626214504242,
|
| 22629 |
+
"learning_rate": 0.00043844669015467863,
|
| 22630 |
+
"loss": 2.0411,
|
| 22631 |
+
"num_input_tokens_seen": 66626274400,
|
| 22632 |
+
"step": 127100
|
| 22633 |
+
},
|
| 22634 |
+
{
|
| 22635 |
+
"epoch": 1.2130197836793588,
|
| 22636 |
+
"grad_norm": 0.12721647322177887,
|
| 22637 |
+
"learning_rate": 0.0004356640162360581,
|
| 22638 |
+
"loss": 2.0469,
|
| 22639 |
+
"num_input_tokens_seen": 66652487040,
|
| 22640 |
+
"step": 127150
|
| 22641 |
+
},
|
| 22642 |
+
{
|
| 22643 |
+
"epoch": 1.2134967861955472,
|
| 22644 |
+
"grad_norm": 0.1383296549320221,
|
| 22645 |
+
"learning_rate": 0.0004328833670911724,
|
| 22646 |
+
"loss": 2.0578,
|
| 22647 |
+
"num_input_tokens_seen": 66678700288,
|
| 22648 |
+
"step": 127200
|
| 22649 |
+
},
|
| 22650 |
+
{
|
| 22651 |
+
"epoch": 1.2139737887117354,
|
| 22652 |
+
"grad_norm": 0.12966816127300262,
|
| 22653 |
+
"learning_rate": 0.00043010483023225046,
|
| 22654 |
+
"loss": 2.0544,
|
| 22655 |
+
"num_input_tokens_seen": 66704910336,
|
| 22656 |
+
"step": 127250
|
| 22657 |
+
},
|
| 22658 |
+
{
|
| 22659 |
+
"epoch": 1.2144507912279237,
|
| 22660 |
+
"grad_norm": 0.13144998252391815,
|
| 22661 |
+
"learning_rate": 0.0004273284931050438,
|
| 22662 |
+
"loss": 2.061,
|
| 22663 |
+
"num_input_tokens_seen": 66731122112,
|
| 22664 |
+
"step": 127300
|
| 22665 |
+
},
|
| 22666 |
+
{
|
| 22667 |
+
"epoch": 1.214927793744112,
|
| 22668 |
+
"grad_norm": 0.13422222435474396,
|
| 22669 |
+
"learning_rate": 0.0004245544430860743,
|
| 22670 |
+
"loss": 2.062,
|
| 22671 |
+
"num_input_tokens_seen": 66757331872,
|
| 22672 |
+
"step": 127350
|
| 22673 |
+
},
|
| 22674 |
+
{
|
| 22675 |
+
"epoch": 1.2154047962603003,
|
| 22676 |
+
"grad_norm": 0.1333204060792923,
|
| 22677 |
+
"learning_rate": 0.0004217827674798845,
|
| 22678 |
+
"loss": 2.0538,
|
| 22679 |
+
"num_input_tokens_seen": 66783545248,
|
| 22680 |
+
"step": 127400
|
| 22681 |
+
},
|
| 22682 |
+
{
|
| 22683 |
+
"epoch": 1.2158817987764885,
|
| 22684 |
+
"grad_norm": 0.13239559531211853,
|
| 22685 |
+
"learning_rate": 0.0004190135535162894,
|
| 22686 |
+
"loss": 2.0545,
|
| 22687 |
+
"num_input_tokens_seen": 66809758656,
|
| 22688 |
+
"step": 127450
|
| 22689 |
+
},
|
| 22690 |
+
{
|
| 22691 |
+
"epoch": 1.2163588012926767,
|
| 22692 |
+
"grad_norm": 0.13535359501838684,
|
| 22693 |
+
"learning_rate": 0.00041624688834763184,
|
| 22694 |
+
"loss": 2.0625,
|
| 22695 |
+
"num_input_tokens_seen": 66835970592,
|
| 22696 |
+
"step": 127500
|
| 22697 |
+
},
|
| 22698 |
+
{
|
| 22699 |
+
"epoch": 1.2163588012926767,
|
| 22700 |
+
"eval_loss": 1.9728902578353882,
|
| 22701 |
+
"eval_runtime": 82.272,
|
| 22702 |
+
"eval_samples_per_second": 60.774,
|
| 22703 |
+
"eval_steps_per_second": 15.194,
|
| 22704 |
+
"num_input_tokens_seen": 66835970592,
|
| 22705 |
+
"step": 127500
|
| 22706 |
+
},
|
| 22707 |
+
{
|
| 22708 |
+
"epoch": 1.2168358038088651,
|
| 22709 |
+
"grad_norm": 0.1306886225938797,
|
| 22710 |
+
"learning_rate": 0.0004134828590460387,
|
| 22711 |
+
"loss": 2.0548,
|
| 22712 |
+
"num_input_tokens_seen": 66862174016,
|
| 22713 |
+
"step": 127550
|
| 22714 |
+
},
|
| 22715 |
+
{
|
| 22716 |
+
"epoch": 1.2173128063250533,
|
| 22717 |
+
"grad_norm": 0.1322244554758072,
|
| 22718 |
+
"learning_rate": 0.0004107215526006817,
|
| 22719 |
+
"loss": 2.0544,
|
| 22720 |
+
"num_input_tokens_seen": 66888384224,
|
| 22721 |
+
"step": 127600
|
| 22722 |
+
},
|
| 22723 |
+
{
|
| 22724 |
+
"epoch": 1.2177898088412416,
|
| 22725 |
+
"grad_norm": 0.13241881132125854,
|
| 22726 |
+
"learning_rate": 0.0004079630559150391,
|
| 22727 |
+
"loss": 2.0646,
|
| 22728 |
+
"num_input_tokens_seen": 66914597888,
|
| 22729 |
+
"step": 127650
|
| 22730 |
+
},
|
| 22731 |
+
{
|
| 22732 |
+
"epoch": 1.21826681135743,
|
| 22733 |
+
"grad_norm": 0.12745130062103271,
|
| 22734 |
+
"learning_rate": 0.0004052074558041608,
|
| 22735 |
+
"loss": 2.0554,
|
| 22736 |
+
"num_input_tokens_seen": 66940807552,
|
| 22737 |
+
"step": 127700
|
| 22738 |
+
},
|
| 22739 |
+
{
|
| 22740 |
+
"epoch": 1.2187438138736182,
|
| 22741 |
+
"grad_norm": 0.13167862594127655,
|
| 22742 |
+
"learning_rate": 0.00040245483899193594,
|
| 22743 |
+
"loss": 2.0449,
|
| 22744 |
+
"num_input_tokens_seen": 66967017376,
|
| 22745 |
+
"step": 127750
|
| 22746 |
+
},
|
| 22747 |
+
{
|
| 22748 |
+
"epoch": 1.2192208163898064,
|
| 22749 |
+
"grad_norm": 0.1641312688589096,
|
| 22750 |
+
"learning_rate": 0.00039970529210836363,
|
| 22751 |
+
"loss": 2.0438,
|
| 22752 |
+
"num_input_tokens_seen": 66993229600,
|
| 22753 |
+
"step": 127800
|
| 22754 |
+
},
|
| 22755 |
+
{
|
| 22756 |
+
"epoch": 1.2196978189059948,
|
| 22757 |
+
"grad_norm": 0.1290162205696106,
|
| 22758 |
+
"learning_rate": 0.00039695890168682686,
|
| 22759 |
+
"loss": 2.0633,
|
| 22760 |
+
"num_input_tokens_seen": 67019433984,
|
| 22761 |
+
"step": 127850
|
| 22762 |
+
},
|
| 22763 |
+
{
|
| 22764 |
+
"epoch": 1.220174821422183,
|
| 22765 |
+
"grad_norm": 0.12822365760803223,
|
| 22766 |
+
"learning_rate": 0.0003942157541613686,
|
| 22767 |
+
"loss": 2.0477,
|
| 22768 |
+
"num_input_tokens_seen": 67045643168,
|
| 22769 |
+
"step": 127900
|
| 22770 |
+
},
|
| 22771 |
+
{
|
| 22772 |
+
"epoch": 1.2206518239383712,
|
| 22773 |
+
"grad_norm": 0.13961108028888702,
|
| 22774 |
+
"learning_rate": 0.0003914759358639719,
|
| 22775 |
+
"loss": 2.063,
|
| 22776 |
+
"num_input_tokens_seen": 67071854592,
|
| 22777 |
+
"step": 127950
|
| 22778 |
+
},
|
| 22779 |
+
{
|
| 22780 |
+
"epoch": 1.2211288264545597,
|
| 22781 |
+
"grad_norm": 0.13082347810268402,
|
| 22782 |
+
"learning_rate": 0.00038873953302184284,
|
| 22783 |
+
"loss": 2.0557,
|
| 22784 |
+
"num_input_tokens_seen": 67098059328,
|
| 22785 |
+
"step": 128000
|
| 22786 |
+
},
|
| 22787 |
+
{
|
| 22788 |
+
"epoch": 1.2211288264545597,
|
| 22789 |
+
"eval_loss": 1.9715449810028076,
|
| 22790 |
+
"eval_runtime": 83.7065,
|
| 22791 |
+
"eval_samples_per_second": 59.733,
|
| 22792 |
+
"eval_steps_per_second": 14.933,
|
| 22793 |
+
"num_input_tokens_seen": 67098059328,
|
| 22794 |
+
"step": 128000
|
| 22795 |
}
|
| 22796 |
],
|
| 22797 |
"logging_steps": 50,
|
| 22798 |
"max_steps": 140000,
|
| 22799 |
+
"num_input_tokens_seen": 67098059328,
|
| 22800 |
"num_train_epochs": 2,
|
| 22801 |
"save_steps": 1000,
|
| 22802 |
"stateful_callbacks": {
|
|
|
|
| 22811 |
"attributes": {}
|
| 22812 |
}
|
| 22813 |
},
|
| 22814 |
+
"total_flos": 1.1875132632453857e+20,
|
| 22815 |
"train_batch_size": 32,
|
| 22816 |
"trial_name": null,
|
| 22817 |
"trial_params": null
|