Training in progress, step 141000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a27f87288d8d797a749da5bf4d352cdabd92413a2e35e052af216c7df1f69945
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:36756e75b3466f2e619ffcb01fde732bcbed6a8bb6e17f933bd8b701f263e4f2
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:95aeb3e8ddbb19f44b8ac55566129494d59b1f0669d87d7f6b45254087f1767e
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e7f2c2062cd4eab2105e1d3af30621ba0055a18128fac0ce700a512d64dfcfc4
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -24928,12 +24928,190 @@
|
|
| 24928 |
"eval_steps_per_second": 15.131,
|
| 24929 |
"num_input_tokens_seen": 73388446624,
|
| 24930 |
"step": 140000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24931 |
}
|
| 24932 |
],
|
| 24933 |
"logging_steps": 50,
|
| 24934 |
-
"max_steps":
|
| 24935 |
-
"num_input_tokens_seen":
|
| 24936 |
-
"num_train_epochs":
|
| 24937 |
"save_steps": 1000,
|
| 24938 |
"stateful_callbacks": {
|
| 24939 |
"TrainerControl": {
|
|
@@ -24942,12 +25120,12 @@
|
|
| 24942 |
"should_evaluate": false,
|
| 24943 |
"should_log": false,
|
| 24944 |
"should_save": true,
|
| 24945 |
-
"should_training_stop":
|
| 24946 |
},
|
| 24947 |
"attributes": {}
|
| 24948 |
}
|
| 24949 |
},
|
| 24950 |
-
"total_flos": 1.
|
| 24951 |
"train_batch_size": 32,
|
| 24952 |
"trial_name": null,
|
| 24953 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.097265706246529,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 141000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 24928 |
"eval_steps_per_second": 15.131,
|
| 24929 |
"num_input_tokens_seen": 73388446624,
|
| 24930 |
"step": 140000
|
| 24931 |
+
},
|
| 24932 |
+
{
|
| 24933 |
+
"epoch": 3.076397950841334,
|
| 24934 |
+
"grad_norm": 0.09252593666315079,
|
| 24935 |
+
"learning_rate": 0.0001,
|
| 24936 |
+
"loss": 2.3535,
|
| 24937 |
+
"num_input_tokens_seen": 73440875424,
|
| 24938 |
+
"step": 140050
|
| 24939 |
+
},
|
| 24940 |
+
{
|
| 24941 |
+
"epoch": 3.0774962537573973,
|
| 24942 |
+
"grad_norm": 0.08520153909921646,
|
| 24943 |
+
"learning_rate": 0.0001,
|
| 24944 |
+
"loss": 2.3529,
|
| 24945 |
+
"num_input_tokens_seen": 73493304224,
|
| 24946 |
+
"step": 140100
|
| 24947 |
+
},
|
| 24948 |
+
{
|
| 24949 |
+
"epoch": 3.07859455667346,
|
| 24950 |
+
"grad_norm": 0.09475487470626831,
|
| 24951 |
+
"learning_rate": 0.0001,
|
| 24952 |
+
"loss": 2.3539,
|
| 24953 |
+
"num_input_tokens_seen": 73545729952,
|
| 24954 |
+
"step": 140150
|
| 24955 |
+
},
|
| 24956 |
+
{
|
| 24957 |
+
"epoch": 3.079692859589523,
|
| 24958 |
+
"grad_norm": 0.08525670319795609,
|
| 24959 |
+
"learning_rate": 0.0001,
|
| 24960 |
+
"loss": 2.3603,
|
| 24961 |
+
"num_input_tokens_seen": 73598155232,
|
| 24962 |
+
"step": 140200
|
| 24963 |
+
},
|
| 24964 |
+
{
|
| 24965 |
+
"epoch": 3.080791162505586,
|
| 24966 |
+
"grad_norm": 0.09414695203304291,
|
| 24967 |
+
"learning_rate": 0.0001,
|
| 24968 |
+
"loss": 2.3596,
|
| 24969 |
+
"num_input_tokens_seen": 73650584032,
|
| 24970 |
+
"step": 140250
|
| 24971 |
+
},
|
| 24972 |
+
{
|
| 24973 |
+
"epoch": 3.0818894654216487,
|
| 24974 |
+
"grad_norm": 0.08829599618911743,
|
| 24975 |
+
"learning_rate": 0.0001,
|
| 24976 |
+
"loss": 2.3582,
|
| 24977 |
+
"num_input_tokens_seen": 73703009408,
|
| 24978 |
+
"step": 140300
|
| 24979 |
+
},
|
| 24980 |
+
{
|
| 24981 |
+
"epoch": 3.082987768337712,
|
| 24982 |
+
"grad_norm": 0.08346480131149292,
|
| 24983 |
+
"learning_rate": 0.0001,
|
| 24984 |
+
"loss": 2.3473,
|
| 24985 |
+
"num_input_tokens_seen": 73755435104,
|
| 24986 |
+
"step": 140350
|
| 24987 |
+
},
|
| 24988 |
+
{
|
| 24989 |
+
"epoch": 3.0840860712537745,
|
| 24990 |
+
"grad_norm": 0.09302923828363419,
|
| 24991 |
+
"learning_rate": 0.0001,
|
| 24992 |
+
"loss": 2.3555,
|
| 24993 |
+
"num_input_tokens_seen": 73807860000,
|
| 24994 |
+
"step": 140400
|
| 24995 |
+
},
|
| 24996 |
+
{
|
| 24997 |
+
"epoch": 3.0851843741698373,
|
| 24998 |
+
"grad_norm": 0.08695721626281738,
|
| 24999 |
+
"learning_rate": 0.0001,
|
| 25000 |
+
"loss": 2.3578,
|
| 25001 |
+
"num_input_tokens_seen": 73860288800,
|
| 25002 |
+
"step": 140450
|
| 25003 |
+
},
|
| 25004 |
+
{
|
| 25005 |
+
"epoch": 3.0862826770859004,
|
| 25006 |
+
"grad_norm": 0.09424284100532532,
|
| 25007 |
+
"learning_rate": 0.0001,
|
| 25008 |
+
"loss": 2.3523,
|
| 25009 |
+
"num_input_tokens_seen": 73912717600,
|
| 25010 |
+
"step": 140500
|
| 25011 |
+
},
|
| 25012 |
+
{
|
| 25013 |
+
"epoch": 3.0862826770859004,
|
| 25014 |
+
"eval_loss": 2.2698493003845215,
|
| 25015 |
+
"eval_runtime": 81.2331,
|
| 25016 |
+
"eval_samples_per_second": 61.551,
|
| 25017 |
+
"eval_steps_per_second": 15.388,
|
| 25018 |
+
"num_input_tokens_seen": 73912717600,
|
| 25019 |
+
"step": 140500
|
| 25020 |
+
},
|
| 25021 |
+
{
|
| 25022 |
+
"epoch": 3.087380980001963,
|
| 25023 |
+
"grad_norm": 0.08606674522161484,
|
| 25024 |
+
"learning_rate": 0.0001,
|
| 25025 |
+
"loss": 2.3589,
|
| 25026 |
+
"num_input_tokens_seen": 73965145984,
|
| 25027 |
+
"step": 140550
|
| 25028 |
+
},
|
| 25029 |
+
{
|
| 25030 |
+
"epoch": 3.0884792829180263,
|
| 25031 |
+
"grad_norm": 0.09220123291015625,
|
| 25032 |
+
"learning_rate": 0.0001,
|
| 25033 |
+
"loss": 2.3503,
|
| 25034 |
+
"num_input_tokens_seen": 74017574784,
|
| 25035 |
+
"step": 140600
|
| 25036 |
+
},
|
| 25037 |
+
{
|
| 25038 |
+
"epoch": 3.089577585834089,
|
| 25039 |
+
"grad_norm": 0.10021138191223145,
|
| 25040 |
+
"learning_rate": 0.0001,
|
| 25041 |
+
"loss": 2.3528,
|
| 25042 |
+
"num_input_tokens_seen": 74070003040,
|
| 25043 |
+
"step": 140650
|
| 25044 |
+
},
|
| 25045 |
+
{
|
| 25046 |
+
"epoch": 3.0906758887501518,
|
| 25047 |
+
"grad_norm": 0.08400563895702362,
|
| 25048 |
+
"learning_rate": 0.0001,
|
| 25049 |
+
"loss": 2.3575,
|
| 25050 |
+
"num_input_tokens_seen": 74122431840,
|
| 25051 |
+
"step": 140700
|
| 25052 |
+
},
|
| 25053 |
+
{
|
| 25054 |
+
"epoch": 3.091774191666215,
|
| 25055 |
+
"grad_norm": 0.08861430734395981,
|
| 25056 |
+
"learning_rate": 0.0001,
|
| 25057 |
+
"loss": 2.3552,
|
| 25058 |
+
"num_input_tokens_seen": 74174859680,
|
| 25059 |
+
"step": 140750
|
| 25060 |
+
},
|
| 25061 |
+
{
|
| 25062 |
+
"epoch": 3.0928724945822776,
|
| 25063 |
+
"grad_norm": 0.08466708660125732,
|
| 25064 |
+
"learning_rate": 0.0001,
|
| 25065 |
+
"loss": 2.3603,
|
| 25066 |
+
"num_input_tokens_seen": 74227284768,
|
| 25067 |
+
"step": 140800
|
| 25068 |
+
},
|
| 25069 |
+
{
|
| 25070 |
+
"epoch": 3.0939707974983404,
|
| 25071 |
+
"grad_norm": 0.08707701414823532,
|
| 25072 |
+
"learning_rate": 0.0001,
|
| 25073 |
+
"loss": 2.3595,
|
| 25074 |
+
"num_input_tokens_seen": 74279711840,
|
| 25075 |
+
"step": 140850
|
| 25076 |
+
},
|
| 25077 |
+
{
|
| 25078 |
+
"epoch": 3.0950691004144035,
|
| 25079 |
+
"grad_norm": 0.08657340705394745,
|
| 25080 |
+
"learning_rate": 0.0001,
|
| 25081 |
+
"loss": 2.3511,
|
| 25082 |
+
"num_input_tokens_seen": 74332140640,
|
| 25083 |
+
"step": 140900
|
| 25084 |
+
},
|
| 25085 |
+
{
|
| 25086 |
+
"epoch": 3.0961674033304663,
|
| 25087 |
+
"grad_norm": 0.08521311730146408,
|
| 25088 |
+
"learning_rate": 0.0001,
|
| 25089 |
+
"loss": 2.3569,
|
| 25090 |
+
"num_input_tokens_seen": 74384569440,
|
| 25091 |
+
"step": 140950
|
| 25092 |
+
},
|
| 25093 |
+
{
|
| 25094 |
+
"epoch": 3.097265706246529,
|
| 25095 |
+
"grad_norm": 0.08738870918750763,
|
| 25096 |
+
"learning_rate": 0.0001,
|
| 25097 |
+
"loss": 2.3587,
|
| 25098 |
+
"num_input_tokens_seen": 74436998240,
|
| 25099 |
+
"step": 141000
|
| 25100 |
+
},
|
| 25101 |
+
{
|
| 25102 |
+
"epoch": 3.097265706246529,
|
| 25103 |
+
"eval_loss": 2.269127607345581,
|
| 25104 |
+
"eval_runtime": 80.825,
|
| 25105 |
+
"eval_samples_per_second": 61.862,
|
| 25106 |
+
"eval_steps_per_second": 15.466,
|
| 25107 |
+
"num_input_tokens_seen": 74436998240,
|
| 25108 |
+
"step": 141000
|
| 25109 |
}
|
| 25110 |
],
|
| 25111 |
"logging_steps": 50,
|
| 25112 |
+
"max_steps": 200000,
|
| 25113 |
+
"num_input_tokens_seen": 74436998240,
|
| 25114 |
+
"num_train_epochs": 5,
|
| 25115 |
"save_steps": 1000,
|
| 25116 |
"stateful_callbacks": {
|
| 25117 |
"TrainerControl": {
|
|
|
|
| 25120 |
"should_evaluate": false,
|
| 25121 |
"should_log": false,
|
| 25122 |
"should_save": true,
|
| 25123 |
+
"should_training_stop": false
|
| 25124 |
},
|
| 25125 |
"attributes": {}
|
| 25126 |
}
|
| 25127 |
},
|
| 25128 |
+
"total_flos": 1.3173990957632102e+20,
|
| 25129 |
"train_batch_size": 32,
|
| 25130 |
"trial_name": null,
|
| 25131 |
"trial_params": null
|
last-checkpoint/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 6008
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:98ea109117710c0c998ea268594e6a7d0e86331c406b4b50e21b67f4948ff266
|
| 3 |
size 6008
|