Training in progress, step 130000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:67a6c7abe32dd438fb09470397d8599e18c7c6f7d6e5ad7c2ea59aa52e0c0fc9
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:af9646577ee4ed03ad7c9691e7703d876a8256d338d3a2fb5035f6f80fe627b5
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1315ef35a655eddf08abff5aa18ec6897fdbfeff08c3f5d07895fadd41b93070
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8acfe6d76758b902ab66b172fa1db8b08d2d4760abe1682738a74d50eadc0c50
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -22970,11 +22970,189 @@
|
|
| 22970 |
"eval_steps_per_second": 15.182,
|
| 22971 |
"num_input_tokens_seen": 67622231264,
|
| 22972 |
"step": 129000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22973 |
}
|
| 22974 |
],
|
| 22975 |
"logging_steps": 50,
|
| 22976 |
"max_steps": 140000,
|
| 22977 |
-
"num_input_tokens_seen":
|
| 22978 |
"num_train_epochs": 2,
|
| 22979 |
"save_steps": 1000,
|
| 22980 |
"stateful_callbacks": {
|
|
@@ -22989,7 +23167,7 @@
|
|
| 22989 |
"attributes": {}
|
| 22990 |
}
|
| 22991 |
},
|
| 22992 |
-
"total_flos": 1.
|
| 22993 |
"train_batch_size": 32,
|
| 22994 |
"trial_name": null,
|
| 22995 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.2402089271020904,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 130000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 22970 |
"eval_steps_per_second": 15.182,
|
| 22971 |
"num_input_tokens_seen": 67622231264,
|
| 22972 |
"step": 129000
|
| 22973 |
+
},
|
| 22974 |
+
{
|
| 22975 |
+
"epoch": 1.2311458792945134,
|
| 22976 |
+
"grad_norm": 0.13078469038009644,
|
| 22977 |
+
"learning_rate": 0.00033221549498448967,
|
| 22978 |
+
"loss": 2.0474,
|
| 22979 |
+
"num_input_tokens_seen": 67648445664,
|
| 22980 |
+
"step": 129050
|
| 22981 |
+
},
|
| 22982 |
+
{
|
| 22983 |
+
"epoch": 1.2316228818107016,
|
| 22984 |
+
"grad_norm": 0.1259986162185669,
|
| 22985 |
+
"learning_rate": 0.0003295758014387375,
|
| 22986 |
+
"loss": 2.0605,
|
| 22987 |
+
"num_input_tokens_seen": 67674654432,
|
| 22988 |
+
"step": 129100
|
| 22989 |
+
},
|
| 22990 |
+
{
|
| 22991 |
+
"epoch": 1.2320998843268898,
|
| 22992 |
+
"grad_norm": 0.13479039072990417,
|
| 22993 |
+
"learning_rate": 0.0003269414714612534,
|
| 22994 |
+
"loss": 2.0499,
|
| 22995 |
+
"num_input_tokens_seen": 67700854208,
|
| 22996 |
+
"step": 129150
|
| 22997 |
+
},
|
| 22998 |
+
{
|
| 22999 |
+
"epoch": 1.232576886843078,
|
| 23000 |
+
"grad_norm": 0.12382933497428894,
|
| 23001 |
+
"learning_rate": 0.0003243125879593286,
|
| 23002 |
+
"loss": 2.0403,
|
| 23003 |
+
"num_input_tokens_seen": 67727067232,
|
| 23004 |
+
"step": 129200
|
| 23005 |
+
},
|
| 23006 |
+
{
|
| 23007 |
+
"epoch": 1.2330538893592664,
|
| 23008 |
+
"grad_norm": 0.13765262067317963,
|
| 23009 |
+
"learning_rate": 0.0003216892336688435,
|
| 23010 |
+
"loss": 2.05,
|
| 23011 |
+
"num_input_tokens_seen": 67753274144,
|
| 23012 |
+
"step": 129250
|
| 23013 |
+
},
|
| 23014 |
+
{
|
| 23015 |
+
"epoch": 1.2335308918754546,
|
| 23016 |
+
"grad_norm": 0.13626757264137268,
|
| 23017 |
+
"learning_rate": 0.000319071491151664,
|
| 23018 |
+
"loss": 2.0533,
|
| 23019 |
+
"num_input_tokens_seen": 67779485312,
|
| 23020 |
+
"step": 129300
|
| 23021 |
+
},
|
| 23022 |
+
{
|
| 23023 |
+
"epoch": 1.2340078943916428,
|
| 23024 |
+
"grad_norm": 0.13541923463344574,
|
| 23025 |
+
"learning_rate": 0.00031645944279304295,
|
| 23026 |
+
"loss": 2.0502,
|
| 23027 |
+
"num_input_tokens_seen": 67805697216,
|
| 23028 |
+
"step": 129350
|
| 23029 |
+
},
|
| 23030 |
+
{
|
| 23031 |
+
"epoch": 1.2344848969078313,
|
| 23032 |
+
"grad_norm": 0.12669889628887177,
|
| 23033 |
+
"learning_rate": 0.00031385317079902743,
|
| 23034 |
+
"loss": 2.0434,
|
| 23035 |
+
"num_input_tokens_seen": 67831908160,
|
| 23036 |
+
"step": 129400
|
| 23037 |
+
},
|
| 23038 |
+
{
|
| 23039 |
+
"epoch": 1.2349618994240195,
|
| 23040 |
+
"grad_norm": 0.12400075793266296,
|
| 23041 |
+
"learning_rate": 0.0003112527571938717,
|
| 23042 |
+
"loss": 2.0556,
|
| 23043 |
+
"num_input_tokens_seen": 67858116736,
|
| 23044 |
+
"step": 129450
|
| 23045 |
+
},
|
| 23046 |
+
{
|
| 23047 |
+
"epoch": 1.2354389019402077,
|
| 23048 |
+
"grad_norm": 0.13263045251369476,
|
| 23049 |
+
"learning_rate": 0.0003086582838174551,
|
| 23050 |
+
"loss": 2.0405,
|
| 23051 |
+
"num_input_tokens_seen": 67884327168,
|
| 23052 |
+
"step": 129500
|
| 23053 |
+
},
|
| 23054 |
+
{
|
| 23055 |
+
"epoch": 1.2354389019402077,
|
| 23056 |
+
"eval_loss": 1.966764211654663,
|
| 23057 |
+
"eval_runtime": 82.4836,
|
| 23058 |
+
"eval_samples_per_second": 60.618,
|
| 23059 |
+
"eval_steps_per_second": 15.155,
|
| 23060 |
+
"num_input_tokens_seen": 67884327168,
|
| 23061 |
+
"step": 129500
|
| 23062 |
+
},
|
| 23063 |
+
{
|
| 23064 |
+
"epoch": 1.235915904456396,
|
| 23065 |
+
"grad_norm": 0.12067709863185883,
|
| 23066 |
+
"learning_rate": 0.00030606983232270746,
|
| 23067 |
+
"loss": 2.0511,
|
| 23068 |
+
"num_input_tokens_seen": 67910538880,
|
| 23069 |
+
"step": 129550
|
| 23070 |
+
},
|
| 23071 |
+
{
|
| 23072 |
+
"epoch": 1.2363929069725843,
|
| 23073 |
+
"grad_norm": 0.13021409511566162,
|
| 23074 |
+
"learning_rate": 0.0003034874841730382,
|
| 23075 |
+
"loss": 2.0525,
|
| 23076 |
+
"num_input_tokens_seen": 67936753280,
|
| 23077 |
+
"step": 129600
|
| 23078 |
+
},
|
| 23079 |
+
{
|
| 23080 |
+
"epoch": 1.2368699094887725,
|
| 23081 |
+
"grad_norm": 0.12661676108837128,
|
| 23082 |
+
"learning_rate": 0.0003009113206397734,
|
| 23083 |
+
"loss": 2.0575,
|
| 23084 |
+
"num_input_tokens_seen": 67962958784,
|
| 23085 |
+
"step": 129650
|
| 23086 |
+
},
|
| 23087 |
+
{
|
| 23088 |
+
"epoch": 1.237346912004961,
|
| 23089 |
+
"grad_norm": 0.12730489671230316,
|
| 23090 |
+
"learning_rate": 0.0002983414227995975,
|
| 23091 |
+
"loss": 2.0552,
|
| 23092 |
+
"num_input_tokens_seen": 67989169536,
|
| 23093 |
+
"step": 129700
|
| 23094 |
+
},
|
| 23095 |
+
{
|
| 23096 |
+
"epoch": 1.2378239145211491,
|
| 23097 |
+
"grad_norm": 0.12583428621292114,
|
| 23098 |
+
"learning_rate": 0.000295777871532002,
|
| 23099 |
+
"loss": 2.0413,
|
| 23100 |
+
"num_input_tokens_seen": 68015382560,
|
| 23101 |
+
"step": 129750
|
| 23102 |
+
},
|
| 23103 |
+
{
|
| 23104 |
+
"epoch": 1.2383009170373374,
|
| 23105 |
+
"grad_norm": 0.12833881378173828,
|
| 23106 |
+
"learning_rate": 0.00029322074751673977,
|
| 23107 |
+
"loss": 2.0456,
|
| 23108 |
+
"num_input_tokens_seen": 68041596960,
|
| 23109 |
+
"step": 129800
|
| 23110 |
+
},
|
| 23111 |
+
{
|
| 23112 |
+
"epoch": 1.2387779195535256,
|
| 23113 |
+
"grad_norm": 0.1263890564441681,
|
| 23114 |
+
"learning_rate": 0.0002906701312312861,
|
| 23115 |
+
"loss": 2.0506,
|
| 23116 |
+
"num_input_tokens_seen": 68067805312,
|
| 23117 |
+
"step": 129850
|
| 23118 |
+
},
|
| 23119 |
+
{
|
| 23120 |
+
"epoch": 1.239254922069714,
|
| 23121 |
+
"grad_norm": 0.1265845000743866,
|
| 23122 |
+
"learning_rate": 0.0002881261029483057,
|
| 23123 |
+
"loss": 2.0376,
|
| 23124 |
+
"num_input_tokens_seen": 68094019712,
|
| 23125 |
+
"step": 129900
|
| 23126 |
+
},
|
| 23127 |
+
{
|
| 23128 |
+
"epoch": 1.2397319245859022,
|
| 23129 |
+
"grad_norm": 0.1379150003194809,
|
| 23130 |
+
"learning_rate": 0.0002855887427331267,
|
| 23131 |
+
"loss": 2.0482,
|
| 23132 |
+
"num_input_tokens_seen": 68120232192,
|
| 23133 |
+
"step": 129950
|
| 23134 |
+
},
|
| 23135 |
+
{
|
| 23136 |
+
"epoch": 1.2402089271020904,
|
| 23137 |
+
"grad_norm": 0.12455019354820251,
|
| 23138 |
+
"learning_rate": 0.00028305813044122096,
|
| 23139 |
+
"loss": 2.038,
|
| 23140 |
+
"num_input_tokens_seen": 68146442176,
|
| 23141 |
+
"step": 130000
|
| 23142 |
+
},
|
| 23143 |
+
{
|
| 23144 |
+
"epoch": 1.2402089271020904,
|
| 23145 |
+
"eval_loss": 1.965224266052246,
|
| 23146 |
+
"eval_runtime": 83.0846,
|
| 23147 |
+
"eval_samples_per_second": 60.18,
|
| 23148 |
+
"eval_steps_per_second": 15.045,
|
| 23149 |
+
"num_input_tokens_seen": 68146442176,
|
| 23150 |
+
"step": 130000
|
| 23151 |
}
|
| 23152 |
],
|
| 23153 |
"logging_steps": 50,
|
| 23154 |
"max_steps": 140000,
|
| 23155 |
+
"num_input_tokens_seen": 68146442176,
|
| 23156 |
"num_train_epochs": 2,
|
| 23157 |
"save_steps": 1000,
|
| 23158 |
"stateful_callbacks": {
|
|
|
|
| 23167 |
"attributes": {}
|
| 23168 |
}
|
| 23169 |
},
|
| 23170 |
+
"total_flos": 1.206067727404671e+20,
|
| 23171 |
"train_batch_size": 32,
|
| 23172 |
"trial_name": null,
|
| 23173 |
"trial_params": null
|