Training in progress, step 138000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:294d2d3cfce69d5bcc552541aff1b1d0c5c39d6adabe16e718423a5d850f0d32
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:57a69c4accd4194b5ef200a371a59ef019db1dfd38dcb87b64dd42832f583b7c
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cdb15604f71f08bf635b865cf27878158a353a64f3dcaa6e5902e3e52c7eb375
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d87ee32367beeb896fbea0e404a77621c8cd628a4eb1251b30dc94e06f2eb792
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -24394,11 +24394,189 @@
|
|
| 24394 |
"eval_steps_per_second": 15.101,
|
| 24395 |
"num_input_tokens_seen": 71815816608,
|
| 24396 |
"step": 137000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24397 |
}
|
| 24398 |
],
|
| 24399 |
"logging_steps": 50,
|
| 24400 |
"max_steps": 140000,
|
| 24401 |
-
"num_input_tokens_seen":
|
| 24402 |
"num_train_epochs": 2,
|
| 24403 |
"save_steps": 1000,
|
| 24404 |
"stateful_callbacks": {
|
|
@@ -24413,7 +24591,7 @@
|
|
| 24413 |
"attributes": {}
|
| 24414 |
}
|
| 24415 |
},
|
| 24416 |
-
"total_flos": 1.
|
| 24417 |
"train_batch_size": 32,
|
| 24418 |
"trial_name": null,
|
| 24419 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.316529329692214,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 138000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 24394 |
"eval_steps_per_second": 15.101,
|
| 24395 |
"num_input_tokens_seen": 71815816608,
|
| 24396 |
"step": 137000
|
| 24397 |
+
},
|
| 24398 |
+
{
|
| 24399 |
+
"epoch": 1.307466281884637,
|
| 24400 |
+
"grad_norm": 0.11646866798400879,
|
| 24401 |
+
"learning_rate": 2.7139335546282283e-05,
|
| 24402 |
+
"loss": 2.0325,
|
| 24403 |
+
"num_input_tokens_seen": 71842030368,
|
| 24404 |
+
"step": 137050
|
| 24405 |
+
},
|
| 24406 |
+
{
|
| 24407 |
+
"epoch": 1.3079432844008252,
|
| 24408 |
+
"grad_norm": 0.10989837348461151,
|
| 24409 |
+
"learning_rate": 2.6235218056235634e-05,
|
| 24410 |
+
"loss": 2.0325,
|
| 24411 |
+
"num_input_tokens_seen": 71868244768,
|
| 24412 |
+
"step": 137100
|
| 24413 |
+
},
|
| 24414 |
+
{
|
| 24415 |
+
"epoch": 1.3084202869170136,
|
| 24416 |
+
"grad_norm": 0.11658209562301636,
|
| 24417 |
+
"learning_rate": 2.5346010829944367e-05,
|
| 24418 |
+
"loss": 2.0289,
|
| 24419 |
+
"num_input_tokens_seen": 71894452160,
|
| 24420 |
+
"step": 137150
|
| 24421 |
+
},
|
| 24422 |
+
{
|
| 24423 |
+
"epoch": 1.3088972894332018,
|
| 24424 |
+
"grad_norm": 0.11487242579460144,
|
| 24425 |
+
"learning_rate": 2.4471741852423235e-05,
|
| 24426 |
+
"loss": 2.0322,
|
| 24427 |
+
"num_input_tokens_seen": 71920664928,
|
| 24428 |
+
"step": 137200
|
| 24429 |
+
},
|
| 24430 |
+
{
|
| 24431 |
+
"epoch": 1.30937429194939,
|
| 24432 |
+
"grad_norm": 0.11544458568096161,
|
| 24433 |
+
"learning_rate": 2.3612438638551835e-05,
|
| 24434 |
+
"loss": 2.0279,
|
| 24435 |
+
"num_input_tokens_seen": 71946876896,
|
| 24436 |
+
"step": 137250
|
| 24437 |
+
},
|
| 24438 |
+
{
|
| 24439 |
+
"epoch": 1.3098512944655782,
|
| 24440 |
+
"grad_norm": 0.11500503867864609,
|
| 24441 |
+
"learning_rate": 2.276812823220964e-05,
|
| 24442 |
+
"loss": 2.0399,
|
| 24443 |
+
"num_input_tokens_seen": 71973091200,
|
| 24444 |
+
"step": 137300
|
| 24445 |
+
},
|
| 24446 |
+
{
|
| 24447 |
+
"epoch": 1.3103282969817664,
|
| 24448 |
+
"grad_norm": 0.11575910449028015,
|
| 24449 |
+
"learning_rate": 2.1938837205424e-05,
|
| 24450 |
+
"loss": 2.0246,
|
| 24451 |
+
"num_input_tokens_seen": 71999300832,
|
| 24452 |
+
"step": 137350
|
| 24453 |
+
},
|
| 24454 |
+
{
|
| 24455 |
+
"epoch": 1.3108052994979549,
|
| 24456 |
+
"grad_norm": 0.1175985336303711,
|
| 24457 |
+
"learning_rate": 2.1124591657534777e-05,
|
| 24458 |
+
"loss": 2.0225,
|
| 24459 |
+
"num_input_tokens_seen": 72025515232,
|
| 24460 |
+
"step": 137400
|
| 24461 |
+
},
|
| 24462 |
+
{
|
| 24463 |
+
"epoch": 1.311282302014143,
|
| 24464 |
+
"grad_norm": 0.11688115447759628,
|
| 24465 |
+
"learning_rate": 2.032541721437209e-05,
|
| 24466 |
+
"loss": 2.024,
|
| 24467 |
+
"num_input_tokens_seen": 72051723040,
|
| 24468 |
+
"step": 137450
|
| 24469 |
+
},
|
| 24470 |
+
{
|
| 24471 |
+
"epoch": 1.3117593045303315,
|
| 24472 |
+
"grad_norm": 0.11419174075126648,
|
| 24473 |
+
"learning_rate": 1.9541339027450256e-05,
|
| 24474 |
+
"loss": 2.0254,
|
| 24475 |
+
"num_input_tokens_seen": 72077935168,
|
| 24476 |
+
"step": 137500
|
| 24477 |
+
},
|
| 24478 |
+
{
|
| 24479 |
+
"epoch": 1.3117593045303315,
|
| 24480 |
+
"eval_loss": 1.951472282409668,
|
| 24481 |
+
"eval_runtime": 83.1149,
|
| 24482 |
+
"eval_samples_per_second": 60.158,
|
| 24483 |
+
"eval_steps_per_second": 15.039,
|
| 24484 |
+
"num_input_tokens_seen": 72077935168,
|
| 24485 |
+
"step": 137500
|
| 24486 |
+
},
|
| 24487 |
+
{
|
| 24488 |
+
"epoch": 1.3122363070465197,
|
| 24489 |
+
"grad_norm": 0.11731937527656555,
|
| 24490 |
+
"learning_rate": 1.8772381773176416e-05,
|
| 24491 |
+
"loss": 2.0368,
|
| 24492 |
+
"num_input_tokens_seen": 72104145664,
|
| 24493 |
+
"step": 137550
|
| 24494 |
+
},
|
| 24495 |
+
{
|
| 24496 |
+
"epoch": 1.312713309562708,
|
| 24497 |
+
"grad_norm": 0.11281976848840714,
|
| 24498 |
+
"learning_rate": 1.801856965207338e-05,
|
| 24499 |
+
"loss": 2.0243,
|
| 24500 |
+
"num_input_tokens_seen": 72130351488,
|
| 24501 |
+
"step": 137600
|
| 24502 |
+
},
|
| 24503 |
+
{
|
| 24504 |
+
"epoch": 1.3131903120788961,
|
| 24505 |
+
"grad_norm": 0.12566816806793213,
|
| 24506 |
+
"learning_rate": 1.7279926388018564e-05,
|
| 24507 |
+
"loss": 2.0266,
|
| 24508 |
+
"num_input_tokens_seen": 72156564000,
|
| 24509 |
+
"step": 137650
|
| 24510 |
+
},
|
| 24511 |
+
{
|
| 24512 |
+
"epoch": 1.3136673145950846,
|
| 24513 |
+
"grad_norm": 0.1202327162027359,
|
| 24514 |
+
"learning_rate": 1.6556475227496815e-05,
|
| 24515 |
+
"loss": 2.0344,
|
| 24516 |
+
"num_input_tokens_seen": 72182768800,
|
| 24517 |
+
"step": 137700
|
| 24518 |
+
},
|
| 24519 |
+
{
|
| 24520 |
+
"epoch": 1.3141443171112728,
|
| 24521 |
+
"grad_norm": 0.11209400743246078,
|
| 24522 |
+
"learning_rate": 1.584823893886933e-05,
|
| 24523 |
+
"loss": 2.0307,
|
| 24524 |
+
"num_input_tokens_seen": 72208977472,
|
| 24525 |
+
"step": 137750
|
| 24526 |
+
},
|
| 24527 |
+
{
|
| 24528 |
+
"epoch": 1.314621319627461,
|
| 24529 |
+
"grad_norm": 0.11281031370162964,
|
| 24530 |
+
"learning_rate": 1.5155239811656562e-05,
|
| 24531 |
+
"loss": 2.0285,
|
| 24532 |
+
"num_input_tokens_seen": 72235186752,
|
| 24533 |
+
"step": 137800
|
| 24534 |
+
},
|
| 24535 |
+
{
|
| 24536 |
+
"epoch": 1.3150983221436494,
|
| 24537 |
+
"grad_norm": 0.11977609992027283,
|
| 24538 |
+
"learning_rate": 1.4477499655837278e-05,
|
| 24539 |
+
"loss": 2.0307,
|
| 24540 |
+
"num_input_tokens_seen": 72261390432,
|
| 24541 |
+
"step": 137850
|
| 24542 |
+
},
|
| 24543 |
+
{
|
| 24544 |
+
"epoch": 1.3155753246598376,
|
| 24545 |
+
"grad_norm": 0.11602313071489334,
|
| 24546 |
+
"learning_rate": 1.3815039801161721e-05,
|
| 24547 |
+
"loss": 2.0272,
|
| 24548 |
+
"num_input_tokens_seen": 72287596960,
|
| 24549 |
+
"step": 137900
|
| 24550 |
+
},
|
| 24551 |
+
{
|
| 24552 |
+
"epoch": 1.3160523271760258,
|
| 24553 |
+
"grad_norm": 0.11629103124141693,
|
| 24554 |
+
"learning_rate": 1.3167881096480372e-05,
|
| 24555 |
+
"loss": 2.0426,
|
| 24556 |
+
"num_input_tokens_seen": 72313806912,
|
| 24557 |
+
"step": 137950
|
| 24558 |
+
},
|
| 24559 |
+
{
|
| 24560 |
+
"epoch": 1.316529329692214,
|
| 24561 |
+
"grad_norm": 0.11337430030107498,
|
| 24562 |
+
"learning_rate": 1.2536043909088191e-05,
|
| 24563 |
+
"loss": 2.0286,
|
| 24564 |
+
"num_input_tokens_seen": 72340003200,
|
| 24565 |
+
"step": 138000
|
| 24566 |
+
},
|
| 24567 |
+
{
|
| 24568 |
+
"epoch": 1.316529329692214,
|
| 24569 |
+
"eval_loss": 1.9512444734573364,
|
| 24570 |
+
"eval_runtime": 82.1325,
|
| 24571 |
+
"eval_samples_per_second": 60.877,
|
| 24572 |
+
"eval_steps_per_second": 15.219,
|
| 24573 |
+
"num_input_tokens_seen": 72340003200,
|
| 24574 |
+
"step": 138000
|
| 24575 |
}
|
| 24576 |
],
|
| 24577 |
"logging_steps": 50,
|
| 24578 |
"max_steps": 140000,
|
| 24579 |
+
"num_input_tokens_seen": 72340003200,
|
| 24580 |
"num_train_epochs": 2,
|
| 24581 |
"save_steps": 1000,
|
| 24582 |
"stateful_callbacks": {
|
|
|
|
| 24591 |
"attributes": {}
|
| 24592 |
}
|
| 24593 |
},
|
| 24594 |
+
"total_flos": 1.2802861084741632e+20,
|
| 24595 |
"train_batch_size": 32,
|
| 24596 |
"trial_name": null,
|
| 24597 |
"trial_params": null
|