Training in progress, step 44000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9605858ca8b64eb89cb8c33fd56e7ec671551b1e5005f2598e074ca5b397cafd
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e2584fadfe92de84830b6f68a11ff9f4508f42d733151a8e29faa8885164fa9e
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5c93fe38009a049e639e4ec9c47956d4822c559f5ecfd6d8454c217a91259ec7
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5356afb30d3aa5783dfb45e83d3ec8fbfdbc01397770efc134aa996a2dcb7311
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -7662,11 +7662,189 @@
|
|
| 7662 |
"eval_steps_per_second": 24.368,
|
| 7663 |
"num_input_tokens_seen": 11272187456,
|
| 7664 |
"step": 43000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7665 |
}
|
| 7666 |
],
|
| 7667 |
"logging_steps": 50,
|
| 7668 |
"max_steps": 70000,
|
| 7669 |
-
"num_input_tokens_seen":
|
| 7670 |
"num_train_epochs": 1,
|
| 7671 |
"save_steps": 1000,
|
| 7672 |
"stateful_callbacks": {
|
|
@@ -7681,7 +7859,7 @@
|
|
| 7681 |
"attributes": {}
|
| 7682 |
}
|
| 7683 |
},
|
| 7684 |
-
"total_flos": 3.
|
| 7685 |
"train_batch_size": 64,
|
| 7686 |
"trial_name": null,
|
| 7687 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.20988110712284008,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 44000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 7662 |
"eval_steps_per_second": 24.368,
|
| 7663 |
"num_input_tokens_seen": 11272187456,
|
| 7664 |
"step": 43000
|
| 7665 |
+
},
|
| 7666 |
+
{
|
| 7667 |
+
"epoch": 0.20534958321905147,
|
| 7668 |
+
"grad_norm": 0.22746357321739197,
|
| 7669 |
+
"learning_rate": 0.001,
|
| 7670 |
+
"loss": 2.6281,
|
| 7671 |
+
"num_input_tokens_seen": 11285294656,
|
| 7672 |
+
"step": 43050
|
| 7673 |
+
},
|
| 7674 |
+
{
|
| 7675 |
+
"epoch": 0.2055880844771456,
|
| 7676 |
+
"grad_norm": 0.21107150614261627,
|
| 7677 |
+
"learning_rate": 0.001,
|
| 7678 |
+
"loss": 2.6154,
|
| 7679 |
+
"num_input_tokens_seen": 11298401856,
|
| 7680 |
+
"step": 43100
|
| 7681 |
+
},
|
| 7682 |
+
{
|
| 7683 |
+
"epoch": 0.20582658573523976,
|
| 7684 |
+
"grad_norm": 0.18025045096874237,
|
| 7685 |
+
"learning_rate": 0.001,
|
| 7686 |
+
"loss": 2.6141,
|
| 7687 |
+
"num_input_tokens_seen": 11311509056,
|
| 7688 |
+
"step": 43150
|
| 7689 |
+
},
|
| 7690 |
+
{
|
| 7691 |
+
"epoch": 0.2060650869933339,
|
| 7692 |
+
"grad_norm": 0.2009642869234085,
|
| 7693 |
+
"learning_rate": 0.001,
|
| 7694 |
+
"loss": 2.6133,
|
| 7695 |
+
"num_input_tokens_seen": 11324616256,
|
| 7696 |
+
"step": 43200
|
| 7697 |
+
},
|
| 7698 |
+
{
|
| 7699 |
+
"epoch": 0.20630358825142803,
|
| 7700 |
+
"grad_norm": 0.1872788518667221,
|
| 7701 |
+
"learning_rate": 0.001,
|
| 7702 |
+
"loss": 2.6197,
|
| 7703 |
+
"num_input_tokens_seen": 11337723456,
|
| 7704 |
+
"step": 43250
|
| 7705 |
+
},
|
| 7706 |
+
{
|
| 7707 |
+
"epoch": 0.20654208950952216,
|
| 7708 |
+
"grad_norm": 0.216310054063797,
|
| 7709 |
+
"learning_rate": 0.001,
|
| 7710 |
+
"loss": 2.6353,
|
| 7711 |
+
"num_input_tokens_seen": 11350830656,
|
| 7712 |
+
"step": 43300
|
| 7713 |
+
},
|
| 7714 |
+
{
|
| 7715 |
+
"epoch": 0.2067805907676163,
|
| 7716 |
+
"grad_norm": 0.2705513536930084,
|
| 7717 |
+
"learning_rate": 0.001,
|
| 7718 |
+
"loss": 2.6333,
|
| 7719 |
+
"num_input_tokens_seen": 11363937856,
|
| 7720 |
+
"step": 43350
|
| 7721 |
+
},
|
| 7722 |
+
{
|
| 7723 |
+
"epoch": 0.20701909202571045,
|
| 7724 |
+
"grad_norm": 0.3040550649166107,
|
| 7725 |
+
"learning_rate": 0.001,
|
| 7726 |
+
"loss": 2.6094,
|
| 7727 |
+
"num_input_tokens_seen": 11377045056,
|
| 7728 |
+
"step": 43400
|
| 7729 |
+
},
|
| 7730 |
+
{
|
| 7731 |
+
"epoch": 0.20725759328380458,
|
| 7732 |
+
"grad_norm": 0.2075599879026413,
|
| 7733 |
+
"learning_rate": 0.001,
|
| 7734 |
+
"loss": 2.6225,
|
| 7735 |
+
"num_input_tokens_seen": 11390152256,
|
| 7736 |
+
"step": 43450
|
| 7737 |
+
},
|
| 7738 |
+
{
|
| 7739 |
+
"epoch": 0.2074960945418987,
|
| 7740 |
+
"grad_norm": 0.22293590009212494,
|
| 7741 |
+
"learning_rate": 0.001,
|
| 7742 |
+
"loss": 2.6271,
|
| 7743 |
+
"num_input_tokens_seen": 11403259456,
|
| 7744 |
+
"step": 43500
|
| 7745 |
+
},
|
| 7746 |
+
{
|
| 7747 |
+
"epoch": 0.2074960945418987,
|
| 7748 |
+
"eval_loss": 2.5097975730895996,
|
| 7749 |
+
"eval_runtime": 51.7037,
|
| 7750 |
+
"eval_samples_per_second": 96.705,
|
| 7751 |
+
"eval_steps_per_second": 24.176,
|
| 7752 |
+
"num_input_tokens_seen": 11403259456,
|
| 7753 |
+
"step": 43500
|
| 7754 |
+
},
|
| 7755 |
+
{
|
| 7756 |
+
"epoch": 0.20773459579999284,
|
| 7757 |
+
"grad_norm": 0.21221335232257843,
|
| 7758 |
+
"learning_rate": 0.001,
|
| 7759 |
+
"loss": 2.618,
|
| 7760 |
+
"num_input_tokens_seen": 11416366656,
|
| 7761 |
+
"step": 43550
|
| 7762 |
+
},
|
| 7763 |
+
{
|
| 7764 |
+
"epoch": 0.20797309705808698,
|
| 7765 |
+
"grad_norm": 0.19894948601722717,
|
| 7766 |
+
"learning_rate": 0.001,
|
| 7767 |
+
"loss": 2.6305,
|
| 7768 |
+
"num_input_tokens_seen": 11429473856,
|
| 7769 |
+
"step": 43600
|
| 7770 |
+
},
|
| 7771 |
+
{
|
| 7772 |
+
"epoch": 0.2082115983161811,
|
| 7773 |
+
"grad_norm": 0.29371336102485657,
|
| 7774 |
+
"learning_rate": 0.001,
|
| 7775 |
+
"loss": 2.6211,
|
| 7776 |
+
"num_input_tokens_seen": 11442581056,
|
| 7777 |
+
"step": 43650
|
| 7778 |
+
},
|
| 7779 |
+
{
|
| 7780 |
+
"epoch": 0.20845009957427527,
|
| 7781 |
+
"grad_norm": 0.19441936910152435,
|
| 7782 |
+
"learning_rate": 0.001,
|
| 7783 |
+
"loss": 2.6355,
|
| 7784 |
+
"num_input_tokens_seen": 11455688256,
|
| 7785 |
+
"step": 43700
|
| 7786 |
+
},
|
| 7787 |
+
{
|
| 7788 |
+
"epoch": 0.2086886008323694,
|
| 7789 |
+
"grad_norm": 0.19868114590644836,
|
| 7790 |
+
"learning_rate": 0.001,
|
| 7791 |
+
"loss": 2.6206,
|
| 7792 |
+
"num_input_tokens_seen": 11468795456,
|
| 7793 |
+
"step": 43750
|
| 7794 |
+
},
|
| 7795 |
+
{
|
| 7796 |
+
"epoch": 0.20892710209046353,
|
| 7797 |
+
"grad_norm": 0.19971340894699097,
|
| 7798 |
+
"learning_rate": 0.001,
|
| 7799 |
+
"loss": 2.6124,
|
| 7800 |
+
"num_input_tokens_seen": 11481902656,
|
| 7801 |
+
"step": 43800
|
| 7802 |
+
},
|
| 7803 |
+
{
|
| 7804 |
+
"epoch": 0.20916560334855766,
|
| 7805 |
+
"grad_norm": 0.22261051833629608,
|
| 7806 |
+
"learning_rate": 0.001,
|
| 7807 |
+
"loss": 2.623,
|
| 7808 |
+
"num_input_tokens_seen": 11495009856,
|
| 7809 |
+
"step": 43850
|
| 7810 |
+
},
|
| 7811 |
+
{
|
| 7812 |
+
"epoch": 0.2094041046066518,
|
| 7813 |
+
"grad_norm": 0.20982281863689423,
|
| 7814 |
+
"learning_rate": 0.001,
|
| 7815 |
+
"loss": 2.6182,
|
| 7816 |
+
"num_input_tokens_seen": 11508117056,
|
| 7817 |
+
"step": 43900
|
| 7818 |
+
},
|
| 7819 |
+
{
|
| 7820 |
+
"epoch": 0.20964260586474592,
|
| 7821 |
+
"grad_norm": 0.2216535359621048,
|
| 7822 |
+
"learning_rate": 0.001,
|
| 7823 |
+
"loss": 2.6086,
|
| 7824 |
+
"num_input_tokens_seen": 11521224256,
|
| 7825 |
+
"step": 43950
|
| 7826 |
+
},
|
| 7827 |
+
{
|
| 7828 |
+
"epoch": 0.20988110712284008,
|
| 7829 |
+
"grad_norm": 0.19298988580703735,
|
| 7830 |
+
"learning_rate": 0.001,
|
| 7831 |
+
"loss": 2.6364,
|
| 7832 |
+
"num_input_tokens_seen": 11534331456,
|
| 7833 |
+
"step": 44000
|
| 7834 |
+
},
|
| 7835 |
+
{
|
| 7836 |
+
"epoch": 0.20988110712284008,
|
| 7837 |
+
"eval_loss": 2.5009121894836426,
|
| 7838 |
+
"eval_runtime": 51.4356,
|
| 7839 |
+
"eval_samples_per_second": 97.209,
|
| 7840 |
+
"eval_steps_per_second": 24.302,
|
| 7841 |
+
"num_input_tokens_seen": 11534331456,
|
| 7842 |
+
"step": 44000
|
| 7843 |
}
|
| 7844 |
],
|
| 7845 |
"logging_steps": 50,
|
| 7846 |
"max_steps": 70000,
|
| 7847 |
+
"num_input_tokens_seen": 11534331456,
|
| 7848 |
"num_train_epochs": 1,
|
| 7849 |
"save_steps": 1000,
|
| 7850 |
"stateful_callbacks": {
|
|
|
|
| 7859 |
"attributes": {}
|
| 7860 |
}
|
| 7861 |
},
|
| 7862 |
+
"total_flos": 3.0855462395550106e+18,
|
| 7863 |
"train_batch_size": 64,
|
| 7864 |
"trial_name": null,
|
| 7865 |
"trial_params": null
|