Training in progress, step 114000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b778ecb426d78f0896855e8fb4aad5b0ed64f4bb1e53aede2d8069fdd044f83f
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e40a86136eefe7a52f906d32b10df1f61bc2559012b7bd8d21fd2f6358ab1422
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2f9d7695201cafd8e529bbb705c4e86352c97146b7f2c1d17b903edf259b2912
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5a3df12db58d0a78ce660a6cf049d113e8861e8aa8611c9714bf603dc61fb3a9
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -20122,11 +20122,189 @@
|
|
| 20122 |
"eval_steps_per_second": 11.42,
|
| 20123 |
"num_input_tokens_seen": 59235047232,
|
| 20124 |
"step": 113000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20125 |
}
|
| 20126 |
],
|
| 20127 |
"logging_steps": 50,
|
| 20128 |
"max_steps": 140000,
|
| 20129 |
-
"num_input_tokens_seen":
|
| 20130 |
"num_train_epochs": 2,
|
| 20131 |
"save_steps": 1000,
|
| 20132 |
"stateful_callbacks": {
|
|
@@ -20141,7 +20319,7 @@
|
|
| 20141 |
"attributes": {}
|
| 20142 |
}
|
| 20143 |
},
|
| 20144 |
-
"total_flos": 1.
|
| 20145 |
"train_batch_size": 32,
|
| 20146 |
"trial_name": null,
|
| 20147 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.0875681219218432,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 114000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 20122 |
"eval_steps_per_second": 11.42,
|
| 20123 |
"num_input_tokens_seen": 59235047232,
|
| 20124 |
"step": 113000
|
| 20125 |
+
},
|
| 20126 |
+
{
|
| 20127 |
+
"epoch": 1.0785050741142659,
|
| 20128 |
+
"grad_norm": 0.14696183800697327,
|
| 20129 |
+
"learning_rate": 0.0009965342284774632,
|
| 20130 |
+
"loss": 2.084,
|
| 20131 |
+
"num_input_tokens_seen": 59261256096,
|
| 20132 |
+
"step": 113050
|
| 20133 |
+
},
|
| 20134 |
+
{
|
| 20135 |
+
"epoch": 1.0789820766304543,
|
| 20136 |
+
"grad_norm": 0.1535506546497345,
|
| 20137 |
+
"learning_rate": 0.0009961967251474822,
|
| 20138 |
+
"loss": 2.0905,
|
| 20139 |
+
"num_input_tokens_seen": 59287464384,
|
| 20140 |
+
"step": 113100
|
| 20141 |
+
},
|
| 20142 |
+
{
|
| 20143 |
+
"epoch": 1.0794590791466425,
|
| 20144 |
+
"grad_norm": 0.14321501553058624,
|
| 20145 |
+
"learning_rate": 0.000995843605578539,
|
| 20146 |
+
"loss": 2.0971,
|
| 20147 |
+
"num_input_tokens_seen": 59313669856,
|
| 20148 |
+
"step": 113150
|
| 20149 |
+
},
|
| 20150 |
+
{
|
| 20151 |
+
"epoch": 1.0799360816628307,
|
| 20152 |
+
"grad_norm": 0.15687337517738342,
|
| 20153 |
+
"learning_rate": 0.0009954748808839674,
|
| 20154 |
+
"loss": 2.0864,
|
| 20155 |
+
"num_input_tokens_seen": 59339879328,
|
| 20156 |
+
"step": 113200
|
| 20157 |
+
},
|
| 20158 |
+
{
|
| 20159 |
+
"epoch": 1.080413084179019,
|
| 20160 |
+
"grad_norm": 0.16271081566810608,
|
| 20161 |
+
"learning_rate": 0.000995090562668223,
|
| 20162 |
+
"loss": 2.0948,
|
| 20163 |
+
"num_input_tokens_seen": 59366089088,
|
| 20164 |
+
"step": 113250
|
| 20165 |
+
},
|
| 20166 |
+
{
|
| 20167 |
+
"epoch": 1.0808900866952074,
|
| 20168 |
+
"grad_norm": 0.14683839678764343,
|
| 20169 |
+
"learning_rate": 0.0009946906630265184,
|
| 20170 |
+
"loss": 2.105,
|
| 20171 |
+
"num_input_tokens_seen": 59392300448,
|
| 20172 |
+
"step": 113300
|
| 20173 |
+
},
|
| 20174 |
+
{
|
| 20175 |
+
"epoch": 1.0813670892113956,
|
| 20176 |
+
"grad_norm": 0.15148819983005524,
|
| 20177 |
+
"learning_rate": 0.0009942751945444437,
|
| 20178 |
+
"loss": 2.0814,
|
| 20179 |
+
"num_input_tokens_seen": 59418514560,
|
| 20180 |
+
"step": 113350
|
| 20181 |
+
},
|
| 20182 |
+
{
|
| 20183 |
+
"epoch": 1.0818440917275838,
|
| 20184 |
+
"grad_norm": 0.14587359130382538,
|
| 20185 |
+
"learning_rate": 0.0009938441702975688,
|
| 20186 |
+
"loss": 2.0943,
|
| 20187 |
+
"num_input_tokens_seen": 59444719360,
|
| 20188 |
+
"step": 113400
|
| 20189 |
+
},
|
| 20190 |
+
{
|
| 20191 |
+
"epoch": 1.0823210942437722,
|
| 20192 |
+
"grad_norm": 0.14699944853782654,
|
| 20193 |
+
"learning_rate": 0.0009933976038510332,
|
| 20194 |
+
"loss": 2.0927,
|
| 20195 |
+
"num_input_tokens_seen": 59470933600,
|
| 20196 |
+
"step": 113450
|
| 20197 |
+
},
|
| 20198 |
+
{
|
| 20199 |
+
"epoch": 1.0827980967599604,
|
| 20200 |
+
"grad_norm": 0.14229649305343628,
|
| 20201 |
+
"learning_rate": 0.0009929355092591179,
|
| 20202 |
+
"loss": 2.0985,
|
| 20203 |
+
"num_input_tokens_seen": 59497148000,
|
| 20204 |
+
"step": 113500
|
| 20205 |
+
},
|
| 20206 |
+
{
|
| 20207 |
+
"epoch": 1.0827980967599604,
|
| 20208 |
+
"eval_loss": 2.009983539581299,
|
| 20209 |
+
"eval_runtime": 82.6823,
|
| 20210 |
+
"eval_samples_per_second": 60.472,
|
| 20211 |
+
"eval_steps_per_second": 15.118,
|
| 20212 |
+
"num_input_tokens_seen": 59497148000,
|
| 20213 |
+
"step": 113500
|
| 20214 |
+
},
|
| 20215 |
+
{
|
| 20216 |
+
"epoch": 1.0832750992761486,
|
| 20217 |
+
"grad_norm": 0.14160077273845673,
|
| 20218 |
+
"learning_rate": 0.0009924579010648041,
|
| 20219 |
+
"loss": 2.0935,
|
| 20220 |
+
"num_input_tokens_seen": 59523359584,
|
| 20221 |
+
"step": 113550
|
| 20222 |
+
},
|
| 20223 |
+
{
|
| 20224 |
+
"epoch": 1.083752101792337,
|
| 20225 |
+
"grad_norm": 0.1411445587873459,
|
| 20226 |
+
"learning_rate": 0.0009919647942993148,
|
| 20227 |
+
"loss": 2.093,
|
| 20228 |
+
"num_input_tokens_seen": 59549569568,
|
| 20229 |
+
"step": 113600
|
| 20230 |
+
},
|
| 20231 |
+
{
|
| 20232 |
+
"epoch": 1.0842291043085253,
|
| 20233 |
+
"grad_norm": 0.13501347601413727,
|
| 20234 |
+
"learning_rate": 0.0009914562044816423,
|
| 20235 |
+
"loss": 2.0919,
|
| 20236 |
+
"num_input_tokens_seen": 59575783200,
|
| 20237 |
+
"step": 113650
|
| 20238 |
+
},
|
| 20239 |
+
{
|
| 20240 |
+
"epoch": 1.0847061068247135,
|
| 20241 |
+
"grad_norm": 0.14355099201202393,
|
| 20242 |
+
"learning_rate": 0.0009909321476180592,
|
| 20243 |
+
"loss": 2.0913,
|
| 20244 |
+
"num_input_tokens_seen": 59601990304,
|
| 20245 |
+
"step": 113700
|
| 20246 |
+
},
|
| 20247 |
+
{
|
| 20248 |
+
"epoch": 1.0851831093409017,
|
| 20249 |
+
"grad_norm": 0.13246339559555054,
|
| 20250 |
+
"learning_rate": 0.0009903926402016153,
|
| 20251 |
+
"loss": 2.0803,
|
| 20252 |
+
"num_input_tokens_seen": 59628197120,
|
| 20253 |
+
"step": 113750
|
| 20254 |
+
},
|
| 20255 |
+
{
|
| 20256 |
+
"epoch": 1.08566011185709,
|
| 20257 |
+
"grad_norm": 0.13418996334075928,
|
| 20258 |
+
"learning_rate": 0.0009898376992116178,
|
| 20259 |
+
"loss": 2.1042,
|
| 20260 |
+
"num_input_tokens_seen": 59654409856,
|
| 20261 |
+
"step": 113800
|
| 20262 |
+
},
|
| 20263 |
+
{
|
| 20264 |
+
"epoch": 1.0861371143732783,
|
| 20265 |
+
"grad_norm": 0.15235918760299683,
|
| 20266 |
+
"learning_rate": 0.0009892673421130977,
|
| 20267 |
+
"loss": 2.0987,
|
| 20268 |
+
"num_input_tokens_seen": 59680620096,
|
| 20269 |
+
"step": 113850
|
| 20270 |
+
},
|
| 20271 |
+
{
|
| 20272 |
+
"epoch": 1.0866141168894665,
|
| 20273 |
+
"grad_norm": 0.1395738422870636,
|
| 20274 |
+
"learning_rate": 0.0009886815868562597,
|
| 20275 |
+
"loss": 2.0932,
|
| 20276 |
+
"num_input_tokens_seen": 59706827264,
|
| 20277 |
+
"step": 113900
|
| 20278 |
+
},
|
| 20279 |
+
{
|
| 20280 |
+
"epoch": 1.087091119405655,
|
| 20281 |
+
"grad_norm": 0.1433008313179016,
|
| 20282 |
+
"learning_rate": 0.000988080451875917,
|
| 20283 |
+
"loss": 2.0943,
|
| 20284 |
+
"num_input_tokens_seen": 59733034688,
|
| 20285 |
+
"step": 113950
|
| 20286 |
+
},
|
| 20287 |
+
{
|
| 20288 |
+
"epoch": 1.0875681219218432,
|
| 20289 |
+
"grad_norm": 0.14490137994289398,
|
| 20290 |
+
"learning_rate": 0.0009874639560909118,
|
| 20291 |
+
"loss": 2.1012,
|
| 20292 |
+
"num_input_tokens_seen": 59759249088,
|
| 20293 |
+
"step": 114000
|
| 20294 |
+
},
|
| 20295 |
+
{
|
| 20296 |
+
"epoch": 1.0875681219218432,
|
| 20297 |
+
"eval_loss": 2.0104737281799316,
|
| 20298 |
+
"eval_runtime": 82.5956,
|
| 20299 |
+
"eval_samples_per_second": 60.536,
|
| 20300 |
+
"eval_steps_per_second": 15.134,
|
| 20301 |
+
"num_input_tokens_seen": 59759249088,
|
| 20302 |
+
"step": 114000
|
| 20303 |
}
|
| 20304 |
],
|
| 20305 |
"logging_steps": 50,
|
| 20306 |
"max_steps": 140000,
|
| 20307 |
+
"num_input_tokens_seen": 59759249088,
|
| 20308 |
"num_train_epochs": 2,
|
| 20309 |
"save_steps": 1000,
|
| 20310 |
"stateful_callbacks": {
|
|
|
|
| 20319 |
"attributes": {}
|
| 20320 |
}
|
| 20321 |
},
|
| 20322 |
+
"total_flos": 1.0576297079872635e+20,
|
| 20323 |
"train_batch_size": 32,
|
| 20324 |
"trial_name": null,
|
| 20325 |
"trial_params": null
|