Training in progress, step 59000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:59a706f60964ffe8cd2b221f9a7465c0f56181a98072bee3057047cce8e408cf
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5ef2119eabf69c54d09db0a76c3313d847c900937c3e2edb463f3eba3b1000af
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dfe4fcebd5141fdf7604535ed8dc60cda464d7e4d084d78ec5c9b7105325f9b5
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3e5b084cf754d7494e17fb8efe3747874197d5052ad1bcb013283a3027835137
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -10332,11 +10332,189 @@
|
|
| 10332 |
"eval_steps_per_second": 23.346,
|
| 10333 |
"num_input_tokens_seen": 15204352000,
|
| 10334 |
"step": 58000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10335 |
}
|
| 10336 |
],
|
| 10337 |
"logging_steps": 50,
|
| 10338 |
"max_steps": 60000,
|
| 10339 |
-
"num_input_tokens_seen":
|
| 10340 |
"num_train_epochs": 1,
|
| 10341 |
"save_steps": 1000,
|
| 10342 |
"stateful_callbacks": {
|
|
@@ -10351,7 +10529,7 @@
|
|
| 10351 |
"attributes": {}
|
| 10352 |
}
|
| 10353 |
},
|
| 10354 |
-
"total_flos": 4.
|
| 10355 |
"train_batch_size": 64,
|
| 10356 |
"trial_name": null,
|
| 10357 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.3968634335749828,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 59000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 10332 |
"eval_steps_per_second": 23.346,
|
| 10333 |
"num_input_tokens_seen": 15204352000,
|
| 10334 |
"step": 58000
|
| 10335 |
+
},
|
| 10336 |
+
{
|
| 10337 |
+
"epoch": 0.39047325964453816,
|
| 10338 |
+
"grad_norm": 0.1577194780111313,
|
| 10339 |
+
"learning_rate": 0.00023875071764202561,
|
| 10340 |
+
"loss": 2.9866,
|
| 10341 |
+
"num_input_tokens_seen": 15217459200,
|
| 10342 |
+
"step": 58050
|
| 10343 |
+
},
|
| 10344 |
+
{
|
| 10345 |
+
"epoch": 0.3908095845882458,
|
| 10346 |
+
"grad_norm": 0.1869671791791916,
|
| 10347 |
+
"learning_rate": 0.00022768048249248646,
|
| 10348 |
+
"loss": 2.9973,
|
| 10349 |
+
"num_input_tokens_seen": 15230566400,
|
| 10350 |
+
"step": 58100
|
| 10351 |
+
},
|
| 10352 |
+
{
|
| 10353 |
+
"epoch": 0.3911459095319534,
|
| 10354 |
+
"grad_norm": 0.1568073183298111,
|
| 10355 |
+
"learning_rate": 0.0002167968815375837,
|
| 10356 |
+
"loss": 3.0012,
|
| 10357 |
+
"num_input_tokens_seen": 15243673600,
|
| 10358 |
+
"step": 58150
|
| 10359 |
+
},
|
| 10360 |
+
{
|
| 10361 |
+
"epoch": 0.391482234475661,
|
| 10362 |
+
"grad_norm": 0.15343065559864044,
|
| 10363 |
+
"learning_rate": 0.00020610737385376348,
|
| 10364 |
+
"loss": 2.988,
|
| 10365 |
+
"num_input_tokens_seen": 15256780800,
|
| 10366 |
+
"step": 58200
|
| 10367 |
+
},
|
| 10368 |
+
{
|
| 10369 |
+
"epoch": 0.3918185594193686,
|
| 10370 |
+
"grad_norm": 0.22413235902786255,
|
| 10371 |
+
"learning_rate": 0.00019561928549563967,
|
| 10372 |
+
"loss": 2.993,
|
| 10373 |
+
"num_input_tokens_seen": 15269888000,
|
| 10374 |
+
"step": 58250
|
| 10375 |
+
},
|
| 10376 |
+
{
|
| 10377 |
+
"epoch": 0.3921548843630762,
|
| 10378 |
+
"grad_norm": 0.1807044893503189,
|
| 10379 |
+
"learning_rate": 0.00018533980447508135,
|
| 10380 |
+
"loss": 2.9905,
|
| 10381 |
+
"num_input_tokens_seen": 15282995200,
|
| 10382 |
+
"step": 58300
|
| 10383 |
+
},
|
| 10384 |
+
{
|
| 10385 |
+
"epoch": 0.39249120930678383,
|
| 10386 |
+
"grad_norm": 0.1571112871170044,
|
| 10387 |
+
"learning_rate": 0.00017527597583490823,
|
| 10388 |
+
"loss": 2.9983,
|
| 10389 |
+
"num_input_tokens_seen": 15296102400,
|
| 10390 |
+
"step": 58350
|
| 10391 |
+
},
|
| 10392 |
+
{
|
| 10393 |
+
"epoch": 0.39282753425049144,
|
| 10394 |
+
"grad_norm": 0.16821637749671936,
|
| 10395 |
+
"learning_rate": 0.00016543469682057105,
|
| 10396 |
+
"loss": 2.9966,
|
| 10397 |
+
"num_input_tokens_seen": 15309209600,
|
| 10398 |
+
"step": 58400
|
| 10399 |
+
},
|
| 10400 |
+
{
|
| 10401 |
+
"epoch": 0.39316385919419905,
|
| 10402 |
+
"grad_norm": 0.1497010737657547,
|
| 10403 |
+
"learning_rate": 0.00015582271215312294,
|
| 10404 |
+
"loss": 2.9814,
|
| 10405 |
+
"num_input_tokens_seen": 15322316800,
|
| 10406 |
+
"step": 58450
|
| 10407 |
+
},
|
| 10408 |
+
{
|
| 10409 |
+
"epoch": 0.39350018413790666,
|
| 10410 |
+
"grad_norm": 0.15679225325584412,
|
| 10411 |
+
"learning_rate": 0.00014644660940672628,
|
| 10412 |
+
"loss": 2.9876,
|
| 10413 |
+
"num_input_tokens_seen": 15335424000,
|
| 10414 |
+
"step": 58500
|
| 10415 |
+
},
|
| 10416 |
+
{
|
| 10417 |
+
"epoch": 0.39350018413790666,
|
| 10418 |
+
"eval_loss": 2.8887994289398193,
|
| 10419 |
+
"eval_runtime": 53.8449,
|
| 10420 |
+
"eval_samples_per_second": 92.859,
|
| 10421 |
+
"eval_steps_per_second": 23.215,
|
| 10422 |
+
"num_input_tokens_seen": 15335424000,
|
| 10423 |
+
"step": 58500
|
| 10424 |
+
},
|
| 10425 |
+
{
|
| 10426 |
+
"epoch": 0.39383650908161427,
|
| 10427 |
+
"grad_norm": 0.15169823169708252,
|
| 10428 |
+
"learning_rate": 0.0001373128144938563,
|
| 10429 |
+
"loss": 2.9875,
|
| 10430 |
+
"num_input_tokens_seen": 15348531200,
|
| 10431 |
+
"step": 58550
|
| 10432 |
+
},
|
| 10433 |
+
{
|
| 10434 |
+
"epoch": 0.3941728340253219,
|
| 10435 |
+
"grad_norm": 0.1635347604751587,
|
| 10436 |
+
"learning_rate": 0.00012842758726130281,
|
| 10437 |
+
"loss": 2.9898,
|
| 10438 |
+
"num_input_tokens_seen": 15361638400,
|
| 10439 |
+
"step": 58600
|
| 10440 |
+
},
|
| 10441 |
+
{
|
| 10442 |
+
"epoch": 0.3945091589690295,
|
| 10443 |
+
"grad_norm": 0.15156348049640656,
|
| 10444 |
+
"learning_rate": 0.00011979701719998454,
|
| 10445 |
+
"loss": 2.9977,
|
| 10446 |
+
"num_input_tokens_seen": 15374745600,
|
| 10447 |
+
"step": 58650
|
| 10448 |
+
},
|
| 10449 |
+
{
|
| 10450 |
+
"epoch": 0.3948454839127371,
|
| 10451 |
+
"grad_norm": 0.15710316598415375,
|
| 10452 |
+
"learning_rate": 0.00011142701927151455,
|
| 10453 |
+
"loss": 2.981,
|
| 10454 |
+
"num_input_tokens_seen": 15387852800,
|
| 10455 |
+
"step": 58700
|
| 10456 |
+
},
|
| 10457 |
+
{
|
| 10458 |
+
"epoch": 0.3951818088564447,
|
| 10459 |
+
"grad_norm": 0.2838917374610901,
|
| 10460 |
+
"learning_rate": 0.00010332332985438247,
|
| 10461 |
+
"loss": 2.9909,
|
| 10462 |
+
"num_input_tokens_seen": 15400960000,
|
| 10463 |
+
"step": 58750
|
| 10464 |
+
},
|
| 10465 |
+
{
|
| 10466 |
+
"epoch": 0.3955181338001524,
|
| 10467 |
+
"grad_norm": 0.1509639173746109,
|
| 10468 |
+
"learning_rate": 9.549150281252633e-05,
|
| 10469 |
+
"loss": 2.9851,
|
| 10470 |
+
"num_input_tokens_seen": 15414067200,
|
| 10471 |
+
"step": 58800
|
| 10472 |
+
},
|
| 10473 |
+
{
|
| 10474 |
+
"epoch": 0.39585445874386,
|
| 10475 |
+
"grad_norm": 0.1501421183347702,
|
| 10476 |
+
"learning_rate": 8.793690568899215e-05,
|
| 10477 |
+
"loss": 2.9931,
|
| 10478 |
+
"num_input_tokens_seen": 15427174400,
|
| 10479 |
+
"step": 58850
|
| 10480 |
+
},
|
| 10481 |
+
{
|
| 10482 |
+
"epoch": 0.3961907836875676,
|
| 10483 |
+
"grad_norm": 0.14904147386550903,
|
| 10484 |
+
"learning_rate": 8.066471602728804e-05,
|
| 10485 |
+
"loss": 2.9862,
|
| 10486 |
+
"num_input_tokens_seen": 15440281600,
|
| 10487 |
+
"step": 58900
|
| 10488 |
+
},
|
| 10489 |
+
{
|
| 10490 |
+
"epoch": 0.3965271086312752,
|
| 10491 |
+
"grad_norm": 0.15182824432849884,
|
| 10492 |
+
"learning_rate": 7.367991782295391e-05,
|
| 10493 |
+
"loss": 2.9882,
|
| 10494 |
+
"num_input_tokens_seen": 15453388800,
|
| 10495 |
+
"step": 58950
|
| 10496 |
+
},
|
| 10497 |
+
{
|
| 10498 |
+
"epoch": 0.3968634335749828,
|
| 10499 |
+
"grad_norm": 0.14710576832294464,
|
| 10500 |
+
"learning_rate": 6.698729810778065e-05,
|
| 10501 |
+
"loss": 2.9856,
|
| 10502 |
+
"num_input_tokens_seen": 15466496000,
|
| 10503 |
+
"step": 59000
|
| 10504 |
+
},
|
| 10505 |
+
{
|
| 10506 |
+
"epoch": 0.3968634335749828,
|
| 10507 |
+
"eval_loss": 2.8845956325531006,
|
| 10508 |
+
"eval_runtime": 53.5429,
|
| 10509 |
+
"eval_samples_per_second": 93.383,
|
| 10510 |
+
"eval_steps_per_second": 23.346,
|
| 10511 |
+
"num_input_tokens_seen": 15466496000,
|
| 10512 |
+
"step": 59000
|
| 10513 |
}
|
| 10514 |
],
|
| 10515 |
"logging_steps": 50,
|
| 10516 |
"max_steps": 60000,
|
| 10517 |
+
"num_input_tokens_seen": 15466496000,
|
| 10518 |
"num_train_epochs": 1,
|
| 10519 |
"save_steps": 1000,
|
| 10520 |
"stateful_callbacks": {
|
|
|
|
| 10529 |
"attributes": {}
|
| 10530 |
}
|
| 10531 |
},
|
| 10532 |
+
"total_flos": 4.13743863300096e+18,
|
| 10533 |
"train_batch_size": 64,
|
| 10534 |
"trial_name": null,
|
| 10535 |
"trial_params": null
|