Training in progress, step 126000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:710baf14c92f1a6ab3eef32ca39e73342de5da970d1c32a072279db6a546bd6e
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3dde6003afedc6dd2fd3bca69826bc4c2467f2fe522f76deae105d064b39f61f
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90e596b43a0993defe8386429a74c73648ebeab624d8851d1dff893410d726b8
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5773cfed09936b668e41d5a19336896fe4fe897bf551564d5056fa5a83c98331
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -22258,11 +22258,189 @@
|
|
| 22258 |
"eval_steps_per_second": 15.182,
|
| 22259 |
"num_input_tokens_seen": 65525493280,
|
| 22260 |
"step": 125000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22261 |
}
|
| 22262 |
],
|
| 22263 |
"logging_steps": 50,
|
| 22264 |
"max_steps": 140000,
|
| 22265 |
-
"num_input_tokens_seen":
|
| 22266 |
"num_train_epochs": 2,
|
| 22267 |
"save_steps": 1000,
|
| 22268 |
"stateful_callbacks": {
|
|
@@ -22277,7 +22455,7 @@
|
|
| 22277 |
"attributes": {}
|
| 22278 |
}
|
| 22279 |
},
|
| 22280 |
-
"total_flos": 1.
|
| 22281 |
"train_batch_size": 32,
|
| 22282 |
"trial_name": null,
|
| 22283 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.2020487258070287,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 126000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 22258 |
"eval_steps_per_second": 15.182,
|
| 22259 |
"num_input_tokens_seen": 65525493280,
|
| 22260 |
"step": 125000
|
| 22261 |
+
},
|
| 22262 |
+
{
|
| 22263 |
+
"epoch": 1.1929856779994514,
|
| 22264 |
+
"grad_norm": 0.13272584974765778,
|
| 22265 |
+
"learning_rate": 0.0005531940155086557,
|
| 22266 |
+
"loss": 2.0602,
|
| 22267 |
+
"num_input_tokens_seen": 65551700064,
|
| 22268 |
+
"step": 125050
|
| 22269 |
+
},
|
| 22270 |
+
{
|
| 22271 |
+
"epoch": 1.1934626805156396,
|
| 22272 |
+
"grad_norm": 0.14066773653030396,
|
| 22273 |
+
"learning_rate": 0.0005504041188505022,
|
| 22274 |
+
"loss": 2.0695,
|
| 22275 |
+
"num_input_tokens_seen": 65577910784,
|
| 22276 |
+
"step": 125100
|
| 22277 |
+
},
|
| 22278 |
+
{
|
| 22279 |
+
"epoch": 1.193939683031828,
|
| 22280 |
+
"grad_norm": 0.13133113086223602,
|
| 22281 |
+
"learning_rate": 0.0005476126358804593,
|
| 22282 |
+
"loss": 2.0686,
|
| 22283 |
+
"num_input_tokens_seen": 65604124224,
|
| 22284 |
+
"step": 125150
|
| 22285 |
+
},
|
| 22286 |
+
{
|
| 22287 |
+
"epoch": 1.1944166855480163,
|
| 22288 |
+
"grad_norm": 0.13990654051303864,
|
| 22289 |
+
"learning_rate": 0.0005448196544517168,
|
| 22290 |
+
"loss": 2.0532,
|
| 22291 |
+
"num_input_tokens_seen": 65630324960,
|
| 22292 |
+
"step": 125200
|
| 22293 |
+
},
|
| 22294 |
+
{
|
| 22295 |
+
"epoch": 1.1948936880642045,
|
| 22296 |
+
"grad_norm": 0.14154765009880066,
|
| 22297 |
+
"learning_rate": 0.0005420252624646238,
|
| 22298 |
+
"loss": 2.0518,
|
| 22299 |
+
"num_input_tokens_seen": 65656532992,
|
| 22300 |
+
"step": 125250
|
| 22301 |
+
},
|
| 22302 |
+
{
|
| 22303 |
+
"epoch": 1.195370690580393,
|
| 22304 |
+
"grad_norm": 0.13149969279766083,
|
| 22305 |
+
"learning_rate": 0.0005392295478639225,
|
| 22306 |
+
"loss": 2.0619,
|
| 22307 |
+
"num_input_tokens_seen": 65682736768,
|
| 22308 |
+
"step": 125300
|
| 22309 |
+
},
|
| 22310 |
+
{
|
| 22311 |
+
"epoch": 1.1958476930965811,
|
| 22312 |
+
"grad_norm": 0.1339765191078186,
|
| 22313 |
+
"learning_rate": 0.0005364325986359802,
|
| 22314 |
+
"loss": 2.0706,
|
| 22315 |
+
"num_input_tokens_seen": 65708951168,
|
| 22316 |
+
"step": 125350
|
| 22317 |
+
},
|
| 22318 |
+
{
|
| 22319 |
+
"epoch": 1.1963246956127693,
|
| 22320 |
+
"grad_norm": 0.13910150527954102,
|
| 22321 |
+
"learning_rate": 0.0005336345028060199,
|
| 22322 |
+
"loss": 2.0596,
|
| 22323 |
+
"num_input_tokens_seen": 65735165568,
|
| 22324 |
+
"step": 125400
|
| 22325 |
+
},
|
| 22326 |
+
{
|
| 22327 |
+
"epoch": 1.1968016981289575,
|
| 22328 |
+
"grad_norm": 0.1447630077600479,
|
| 22329 |
+
"learning_rate": 0.0005308353484353508,
|
| 22330 |
+
"loss": 2.0518,
|
| 22331 |
+
"num_input_tokens_seen": 65761369888,
|
| 22332 |
+
"step": 125450
|
| 22333 |
+
},
|
| 22334 |
+
{
|
| 22335 |
+
"epoch": 1.197278700645146,
|
| 22336 |
+
"grad_norm": 0.13201679289340973,
|
| 22337 |
+
"learning_rate": 0.0005280352236185959,
|
| 22338 |
+
"loss": 2.0645,
|
| 22339 |
+
"num_input_tokens_seen": 65787582144,
|
| 22340 |
+
"step": 125500
|
| 22341 |
+
},
|
| 22342 |
+
{
|
| 22343 |
+
"epoch": 1.197278700645146,
|
| 22344 |
+
"eval_loss": 1.9799100160598755,
|
| 22345 |
+
"eval_runtime": 83.01,
|
| 22346 |
+
"eval_samples_per_second": 60.234,
|
| 22347 |
+
"eval_steps_per_second": 15.058,
|
| 22348 |
+
"num_input_tokens_seen": 65787582144,
|
| 22349 |
+
"step": 125500
|
| 22350 |
+
},
|
| 22351 |
+
{
|
| 22352 |
+
"epoch": 1.1977557031613342,
|
| 22353 |
+
"grad_norm": 0.1335040032863617,
|
| 22354 |
+
"learning_rate": 0.0005252342164809204,
|
| 22355 |
+
"loss": 2.0597,
|
| 22356 |
+
"num_input_tokens_seen": 65813796352,
|
| 22357 |
+
"step": 125550
|
| 22358 |
+
},
|
| 22359 |
+
{
|
| 22360 |
+
"epoch": 1.1982327056775224,
|
| 22361 |
+
"grad_norm": 0.13693130016326904,
|
| 22362 |
+
"learning_rate": 0.0005224324151752575,
|
| 22363 |
+
"loss": 2.0594,
|
| 22364 |
+
"num_input_tokens_seen": 65840010208,
|
| 22365 |
+
"step": 125600
|
| 22366 |
+
},
|
| 22367 |
+
{
|
| 22368 |
+
"epoch": 1.1987097081937108,
|
| 22369 |
+
"grad_norm": 0.13866880536079407,
|
| 22370 |
+
"learning_rate": 0.0005196299078795343,
|
| 22371 |
+
"loss": 2.0511,
|
| 22372 |
+
"num_input_tokens_seen": 65866216672,
|
| 22373 |
+
"step": 125650
|
| 22374 |
+
},
|
| 22375 |
+
{
|
| 22376 |
+
"epoch": 1.199186710709899,
|
| 22377 |
+
"grad_norm": 0.12740108370780945,
|
| 22378 |
+
"learning_rate": 0.000516826782793897,
|
| 22379 |
+
"loss": 2.0607,
|
| 22380 |
+
"num_input_tokens_seen": 65892430944,
|
| 22381 |
+
"step": 125700
|
| 22382 |
+
},
|
| 22383 |
+
{
|
| 22384 |
+
"epoch": 1.1996637132260872,
|
| 22385 |
+
"grad_norm": 0.13575108349323273,
|
| 22386 |
+
"learning_rate": 0.0005140231281379345,
|
| 22387 |
+
"loss": 2.0555,
|
| 22388 |
+
"num_input_tokens_seen": 65918642496,
|
| 22389 |
+
"step": 125750
|
| 22390 |
+
},
|
| 22391 |
+
{
|
| 22392 |
+
"epoch": 1.2001407157422754,
|
| 22393 |
+
"grad_norm": 0.13791455328464508,
|
| 22394 |
+
"learning_rate": 0.0005112190321479025,
|
| 22395 |
+
"loss": 2.0632,
|
| 22396 |
+
"num_input_tokens_seen": 65944852960,
|
| 22397 |
+
"step": 125800
|
| 22398 |
+
},
|
| 22399 |
+
{
|
| 22400 |
+
"epoch": 1.2006177182584639,
|
| 22401 |
+
"grad_norm": 0.1315431296825409,
|
| 22402 |
+
"learning_rate": 0.0005084145830739461,
|
| 22403 |
+
"loss": 2.0646,
|
| 22404 |
+
"num_input_tokens_seen": 65971066432,
|
| 22405 |
+
"step": 125850
|
| 22406 |
+
},
|
| 22407 |
+
{
|
| 22408 |
+
"epoch": 1.201094720774652,
|
| 22409 |
+
"grad_norm": 0.12288303673267365,
|
| 22410 |
+
"learning_rate": 0.000505609869177323,
|
| 22411 |
+
"loss": 2.0748,
|
| 22412 |
+
"num_input_tokens_seen": 65997277888,
|
| 22413 |
+
"step": 125900
|
| 22414 |
+
},
|
| 22415 |
+
{
|
| 22416 |
+
"epoch": 1.2015717232908403,
|
| 22417 |
+
"grad_norm": 0.12677106261253357,
|
| 22418 |
+
"learning_rate": 0.0005028049787276249,
|
| 22419 |
+
"loss": 2.0595,
|
| 22420 |
+
"num_input_tokens_seen": 66023480960,
|
| 22421 |
+
"step": 125950
|
| 22422 |
+
},
|
| 22423 |
+
{
|
| 22424 |
+
"epoch": 1.2020487258070287,
|
| 22425 |
+
"grad_norm": 0.140994593501091,
|
| 22426 |
+
"learning_rate": 0.0005,
|
| 22427 |
+
"loss": 2.0556,
|
| 22428 |
+
"num_input_tokens_seen": 66049692768,
|
| 22429 |
+
"step": 126000
|
| 22430 |
+
},
|
| 22431 |
+
{
|
| 22432 |
+
"epoch": 1.2020487258070287,
|
| 22433 |
+
"eval_loss": 1.978381633758545,
|
| 22434 |
+
"eval_runtime": 81.8164,
|
| 22435 |
+
"eval_samples_per_second": 61.112,
|
| 22436 |
+
"eval_steps_per_second": 15.278,
|
| 22437 |
+
"num_input_tokens_seen": 66049692768,
|
| 22438 |
+
"step": 126000
|
| 22439 |
}
|
| 22440 |
],
|
| 22441 |
"logging_steps": 50,
|
| 22442 |
"max_steps": 140000,
|
| 22443 |
+
"num_input_tokens_seen": 66049692768,
|
| 22444 |
"num_train_epochs": 2,
|
| 22445 |
"save_steps": 1000,
|
| 22446 |
"stateful_callbacks": {
|
|
|
|
| 22455 |
"attributes": {}
|
| 22456 |
}
|
| 22457 |
},
|
| 22458 |
+
"total_flos": 1.1689590873539912e+20,
|
| 22459 |
"train_batch_size": 32,
|
| 22460 |
"trial_name": null,
|
| 22461 |
"trial_params": null
|