Training in progress, step 28000, checkpoint
Browse files- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/pytorch_model.bin +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +353 -3
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 304481530
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2057e4bc4ccb7266894aa681fe099f5645555d35372ed2c2f53abaad870b8285
|
| 3 |
size 304481530
|
last-checkpoint/pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 402029570
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:49e7c91022600e2317a6a9b8ec33d6b3225250425e275f6eed0bdadc714f7fa6
|
| 3 |
size 402029570
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0f92647ded7f1a6725e7ffd2310a8d2fbafb5da62cf15755b5f3e6fb2fdf499f
|
| 3 |
size 14960
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5badfec76e553ebbd712f8d9135dd4df979bf9196652df1ae9ad27ae709e59c4
|
| 3 |
size 14960
|
last-checkpoint/rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8fdddb3d61ba5e574c0c975793584282bdce7b095bac6bf2d58912967ca7933b
|
| 3 |
size 14960
|
last-checkpoint/rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3be341579a31269cdfe494164e23b8a4ba61b71f1f432b36a2c0aef7d49c9b92
|
| 3 |
size 14960
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:18c359f46f82e1c9ecfbab9a4532bc57a1a730dfa02c76c631eb621b98761e8a
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -9458,6 +9458,356 @@
|
|
| 9458 |
"learning_rate": 0.0004913851341466507,
|
| 9459 |
"loss": 18.4303,
|
| 9460 |
"step": 27000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9461 |
}
|
| 9462 |
],
|
| 9463 |
"logging_steps": 20,
|
|
@@ -9477,7 +9827,7 @@
|
|
| 9477 |
"attributes": {}
|
| 9478 |
}
|
| 9479 |
},
|
| 9480 |
-
"total_flos":
|
| 9481 |
"train_batch_size": 48,
|
| 9482 |
"trial_name": null,
|
| 9483 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.05459925608513584,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 28000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 9458 |
"learning_rate": 0.0004913851341466507,
|
| 9459 |
"loss": 18.4303,
|
| 9460 |
"step": 27000
|
| 9461 |
+
},
|
| 9462 |
+
{
|
| 9463 |
+
"epoch": 0.05268828212215609,
|
| 9464 |
+
"grad_norm": 9.1875,
|
| 9465 |
+
"learning_rate": 0.0004913786321157414,
|
| 9466 |
+
"loss": 18.495,
|
| 9467 |
+
"step": 27020
|
| 9468 |
+
},
|
| 9469 |
+
{
|
| 9470 |
+
"epoch": 0.05272728159078833,
|
| 9471 |
+
"grad_norm": 9.1875,
|
| 9472 |
+
"learning_rate": 0.000491372130084832,
|
| 9473 |
+
"loss": 18.491,
|
| 9474 |
+
"step": 27040
|
| 9475 |
+
},
|
| 9476 |
+
{
|
| 9477 |
+
"epoch": 0.052766281059420564,
|
| 9478 |
+
"grad_norm": 7.90625,
|
| 9479 |
+
"learning_rate": 0.0004913656280539227,
|
| 9480 |
+
"loss": 18.4938,
|
| 9481 |
+
"step": 27060
|
| 9482 |
+
},
|
| 9483 |
+
{
|
| 9484 |
+
"epoch": 0.052805280528052806,
|
| 9485 |
+
"grad_norm": 8.625,
|
| 9486 |
+
"learning_rate": 0.0004913591260230133,
|
| 9487 |
+
"loss": 18.514,
|
| 9488 |
+
"step": 27080
|
| 9489 |
+
},
|
| 9490 |
+
{
|
| 9491 |
+
"epoch": 0.05284427999668505,
|
| 9492 |
+
"grad_norm": 9.4375,
|
| 9493 |
+
"learning_rate": 0.000491352623992104,
|
| 9494 |
+
"loss": 18.4142,
|
| 9495 |
+
"step": 27100
|
| 9496 |
+
},
|
| 9497 |
+
{
|
| 9498 |
+
"epoch": 0.05288327946531728,
|
| 9499 |
+
"grad_norm": 8.375,
|
| 9500 |
+
"learning_rate": 0.0004913461219611947,
|
| 9501 |
+
"loss": 18.5517,
|
| 9502 |
+
"step": 27120
|
| 9503 |
+
},
|
| 9504 |
+
{
|
| 9505 |
+
"epoch": 0.052922278933949525,
|
| 9506 |
+
"grad_norm": 8.25,
|
| 9507 |
+
"learning_rate": 0.0004913396199302852,
|
| 9508 |
+
"loss": 18.4506,
|
| 9509 |
+
"step": 27140
|
| 9510 |
+
},
|
| 9511 |
+
{
|
| 9512 |
+
"epoch": 0.05296127840258177,
|
| 9513 |
+
"grad_norm": 8.9375,
|
| 9514 |
+
"learning_rate": 0.0004913331178993759,
|
| 9515 |
+
"loss": 18.4626,
|
| 9516 |
+
"step": 27160
|
| 9517 |
+
},
|
| 9518 |
+
{
|
| 9519 |
+
"epoch": 0.053000277871214,
|
| 9520 |
+
"grad_norm": 8.5,
|
| 9521 |
+
"learning_rate": 0.0004913266158684665,
|
| 9522 |
+
"loss": 18.4563,
|
| 9523 |
+
"step": 27180
|
| 9524 |
+
},
|
| 9525 |
+
{
|
| 9526 |
+
"epoch": 0.053039277339846244,
|
| 9527 |
+
"grad_norm": 7.875,
|
| 9528 |
+
"learning_rate": 0.0004913201138375572,
|
| 9529 |
+
"loss": 18.5159,
|
| 9530 |
+
"step": 27200
|
| 9531 |
+
},
|
| 9532 |
+
{
|
| 9533 |
+
"epoch": 0.053078276808478486,
|
| 9534 |
+
"grad_norm": 8.375,
|
| 9535 |
+
"learning_rate": 0.0004913136118066478,
|
| 9536 |
+
"loss": 18.4415,
|
| 9537 |
+
"step": 27220
|
| 9538 |
+
},
|
| 9539 |
+
{
|
| 9540 |
+
"epoch": 0.05311727627711072,
|
| 9541 |
+
"grad_norm": 9.1875,
|
| 9542 |
+
"learning_rate": 0.0004913071097757385,
|
| 9543 |
+
"loss": 18.4588,
|
| 9544 |
+
"step": 27240
|
| 9545 |
+
},
|
| 9546 |
+
{
|
| 9547 |
+
"epoch": 0.05315627574574296,
|
| 9548 |
+
"grad_norm": 9.6875,
|
| 9549 |
+
"learning_rate": 0.0004913006077448291,
|
| 9550 |
+
"loss": 18.508,
|
| 9551 |
+
"step": 27260
|
| 9552 |
+
},
|
| 9553 |
+
{
|
| 9554 |
+
"epoch": 0.053195275214375205,
|
| 9555 |
+
"grad_norm": 8.25,
|
| 9556 |
+
"learning_rate": 0.0004912941057139198,
|
| 9557 |
+
"loss": 18.419,
|
| 9558 |
+
"step": 27280
|
| 9559 |
+
},
|
| 9560 |
+
{
|
| 9561 |
+
"epoch": 0.05323427468300745,
|
| 9562 |
+
"grad_norm": 9.5,
|
| 9563 |
+
"learning_rate": 0.0004912876036830103,
|
| 9564 |
+
"loss": 18.3969,
|
| 9565 |
+
"step": 27300
|
| 9566 |
+
},
|
| 9567 |
+
{
|
| 9568 |
+
"epoch": 0.05327327415163968,
|
| 9569 |
+
"grad_norm": 8.5625,
|
| 9570 |
+
"learning_rate": 0.000491281101652101,
|
| 9571 |
+
"loss": 18.4816,
|
| 9572 |
+
"step": 27320
|
| 9573 |
+
},
|
| 9574 |
+
{
|
| 9575 |
+
"epoch": 0.053312273620271924,
|
| 9576 |
+
"grad_norm": 8.875,
|
| 9577 |
+
"learning_rate": 0.0004912745996211917,
|
| 9578 |
+
"loss": 18.4631,
|
| 9579 |
+
"step": 27340
|
| 9580 |
+
},
|
| 9581 |
+
{
|
| 9582 |
+
"epoch": 0.053351273088904166,
|
| 9583 |
+
"grad_norm": 9.1875,
|
| 9584 |
+
"learning_rate": 0.0004912680975902823,
|
| 9585 |
+
"loss": 18.3853,
|
| 9586 |
+
"step": 27360
|
| 9587 |
+
},
|
| 9588 |
+
{
|
| 9589 |
+
"epoch": 0.0533902725575364,
|
| 9590 |
+
"grad_norm": 8.9375,
|
| 9591 |
+
"learning_rate": 0.000491261595559373,
|
| 9592 |
+
"loss": 18.4279,
|
| 9593 |
+
"step": 27380
|
| 9594 |
+
},
|
| 9595 |
+
{
|
| 9596 |
+
"epoch": 0.053429272026168644,
|
| 9597 |
+
"grad_norm": 9.3125,
|
| 9598 |
+
"learning_rate": 0.0004912550935284636,
|
| 9599 |
+
"loss": 18.415,
|
| 9600 |
+
"step": 27400
|
| 9601 |
+
},
|
| 9602 |
+
{
|
| 9603 |
+
"epoch": 0.053468271494800886,
|
| 9604 |
+
"grad_norm": 8.25,
|
| 9605 |
+
"learning_rate": 0.0004912485914975543,
|
| 9606 |
+
"loss": 18.4297,
|
| 9607 |
+
"step": 27420
|
| 9608 |
+
},
|
| 9609 |
+
{
|
| 9610 |
+
"epoch": 0.05350727096343312,
|
| 9611 |
+
"grad_norm": 9.0,
|
| 9612 |
+
"learning_rate": 0.0004912420894666449,
|
| 9613 |
+
"loss": 18.4647,
|
| 9614 |
+
"step": 27440
|
| 9615 |
+
},
|
| 9616 |
+
{
|
| 9617 |
+
"epoch": 0.05354627043206536,
|
| 9618 |
+
"grad_norm": 9.125,
|
| 9619 |
+
"learning_rate": 0.0004912355874357355,
|
| 9620 |
+
"loss": 18.4773,
|
| 9621 |
+
"step": 27460
|
| 9622 |
+
},
|
| 9623 |
+
{
|
| 9624 |
+
"epoch": 0.053585269900697605,
|
| 9625 |
+
"grad_norm": 8.8125,
|
| 9626 |
+
"learning_rate": 0.0004912290854048262,
|
| 9627 |
+
"loss": 18.4229,
|
| 9628 |
+
"step": 27480
|
| 9629 |
+
},
|
| 9630 |
+
{
|
| 9631 |
+
"epoch": 0.05362426936932984,
|
| 9632 |
+
"grad_norm": 8.75,
|
| 9633 |
+
"learning_rate": 0.0004912225833739168,
|
| 9634 |
+
"loss": 18.4303,
|
| 9635 |
+
"step": 27500
|
| 9636 |
+
},
|
| 9637 |
+
{
|
| 9638 |
+
"epoch": 0.05366326883796208,
|
| 9639 |
+
"grad_norm": 8.9375,
|
| 9640 |
+
"learning_rate": 0.0004912160813430075,
|
| 9641 |
+
"loss": 18.5098,
|
| 9642 |
+
"step": 27520
|
| 9643 |
+
},
|
| 9644 |
+
{
|
| 9645 |
+
"epoch": 0.053702268306594324,
|
| 9646 |
+
"grad_norm": 8.375,
|
| 9647 |
+
"learning_rate": 0.0004912095793120981,
|
| 9648 |
+
"loss": 18.4295,
|
| 9649 |
+
"step": 27540
|
| 9650 |
+
},
|
| 9651 |
+
{
|
| 9652 |
+
"epoch": 0.053741267775226566,
|
| 9653 |
+
"grad_norm": 9.4375,
|
| 9654 |
+
"learning_rate": 0.0004912030772811888,
|
| 9655 |
+
"loss": 18.3608,
|
| 9656 |
+
"step": 27560
|
| 9657 |
+
},
|
| 9658 |
+
{
|
| 9659 |
+
"epoch": 0.0537802672438588,
|
| 9660 |
+
"grad_norm": 9.875,
|
| 9661 |
+
"learning_rate": 0.0004911965752502794,
|
| 9662 |
+
"loss": 18.4168,
|
| 9663 |
+
"step": 27580
|
| 9664 |
+
},
|
| 9665 |
+
{
|
| 9666 |
+
"epoch": 0.05381926671249104,
|
| 9667 |
+
"grad_norm": 8.125,
|
| 9668 |
+
"learning_rate": 0.0004911900732193701,
|
| 9669 |
+
"loss": 18.3512,
|
| 9670 |
+
"step": 27600
|
| 9671 |
+
},
|
| 9672 |
+
{
|
| 9673 |
+
"epoch": 0.053858266181123285,
|
| 9674 |
+
"grad_norm": 8.3125,
|
| 9675 |
+
"learning_rate": 0.0004911835711884607,
|
| 9676 |
+
"loss": 18.3994,
|
| 9677 |
+
"step": 27620
|
| 9678 |
+
},
|
| 9679 |
+
{
|
| 9680 |
+
"epoch": 0.05389726564975552,
|
| 9681 |
+
"grad_norm": 8.625,
|
| 9682 |
+
"learning_rate": 0.0004911770691575514,
|
| 9683 |
+
"loss": 18.3586,
|
| 9684 |
+
"step": 27640
|
| 9685 |
+
},
|
| 9686 |
+
{
|
| 9687 |
+
"epoch": 0.05393626511838776,
|
| 9688 |
+
"grad_norm": 8.375,
|
| 9689 |
+
"learning_rate": 0.0004911705671266421,
|
| 9690 |
+
"loss": 18.3836,
|
| 9691 |
+
"step": 27660
|
| 9692 |
+
},
|
| 9693 |
+
{
|
| 9694 |
+
"epoch": 0.053975264587020004,
|
| 9695 |
+
"grad_norm": 8.625,
|
| 9696 |
+
"learning_rate": 0.0004911640650957326,
|
| 9697 |
+
"loss": 18.366,
|
| 9698 |
+
"step": 27680
|
| 9699 |
+
},
|
| 9700 |
+
{
|
| 9701 |
+
"epoch": 0.05401426405565224,
|
| 9702 |
+
"grad_norm": 9.75,
|
| 9703 |
+
"learning_rate": 0.0004911575630648233,
|
| 9704 |
+
"loss": 18.4281,
|
| 9705 |
+
"step": 27700
|
| 9706 |
+
},
|
| 9707 |
+
{
|
| 9708 |
+
"epoch": 0.05405326352428448,
|
| 9709 |
+
"grad_norm": 8.4375,
|
| 9710 |
+
"learning_rate": 0.0004911510610339139,
|
| 9711 |
+
"loss": 18.3837,
|
| 9712 |
+
"step": 27720
|
| 9713 |
+
},
|
| 9714 |
+
{
|
| 9715 |
+
"epoch": 0.05409226299291672,
|
| 9716 |
+
"grad_norm": 8.1875,
|
| 9717 |
+
"learning_rate": 0.0004911445590030046,
|
| 9718 |
+
"loss": 18.4365,
|
| 9719 |
+
"step": 27740
|
| 9720 |
+
},
|
| 9721 |
+
{
|
| 9722 |
+
"epoch": 0.05413126246154896,
|
| 9723 |
+
"grad_norm": 8.5,
|
| 9724 |
+
"learning_rate": 0.0004911380569720952,
|
| 9725 |
+
"loss": 18.357,
|
| 9726 |
+
"step": 27760
|
| 9727 |
+
},
|
| 9728 |
+
{
|
| 9729 |
+
"epoch": 0.0541702619301812,
|
| 9730 |
+
"grad_norm": 8.0625,
|
| 9731 |
+
"learning_rate": 0.0004911315549411859,
|
| 9732 |
+
"loss": 18.3518,
|
| 9733 |
+
"step": 27780
|
| 9734 |
+
},
|
| 9735 |
+
{
|
| 9736 |
+
"epoch": 0.05420926139881344,
|
| 9737 |
+
"grad_norm": 9.6875,
|
| 9738 |
+
"learning_rate": 0.0004911250529102766,
|
| 9739 |
+
"loss": 18.421,
|
| 9740 |
+
"step": 27800
|
| 9741 |
+
},
|
| 9742 |
+
{
|
| 9743 |
+
"epoch": 0.054248260867445684,
|
| 9744 |
+
"grad_norm": 10.0,
|
| 9745 |
+
"learning_rate": 0.0004911185508793672,
|
| 9746 |
+
"loss": 18.3555,
|
| 9747 |
+
"step": 27820
|
| 9748 |
+
},
|
| 9749 |
+
{
|
| 9750 |
+
"epoch": 0.05428726033607792,
|
| 9751 |
+
"grad_norm": 7.84375,
|
| 9752 |
+
"learning_rate": 0.0004911120488484579,
|
| 9753 |
+
"loss": 18.3645,
|
| 9754 |
+
"step": 27840
|
| 9755 |
+
},
|
| 9756 |
+
{
|
| 9757 |
+
"epoch": 0.05432625980471016,
|
| 9758 |
+
"grad_norm": 8.5625,
|
| 9759 |
+
"learning_rate": 0.0004911055468175485,
|
| 9760 |
+
"loss": 18.3514,
|
| 9761 |
+
"step": 27860
|
| 9762 |
+
},
|
| 9763 |
+
{
|
| 9764 |
+
"epoch": 0.0543652592733424,
|
| 9765 |
+
"grad_norm": 7.59375,
|
| 9766 |
+
"learning_rate": 0.0004910990447866392,
|
| 9767 |
+
"loss": 18.4433,
|
| 9768 |
+
"step": 27880
|
| 9769 |
+
},
|
| 9770 |
+
{
|
| 9771 |
+
"epoch": 0.05440425874197464,
|
| 9772 |
+
"grad_norm": 8.0625,
|
| 9773 |
+
"learning_rate": 0.0004910925427557298,
|
| 9774 |
+
"loss": 18.3955,
|
| 9775 |
+
"step": 27900
|
| 9776 |
+
},
|
| 9777 |
+
{
|
| 9778 |
+
"epoch": 0.05444325821060688,
|
| 9779 |
+
"grad_norm": 7.65625,
|
| 9780 |
+
"learning_rate": 0.0004910860407248204,
|
| 9781 |
+
"loss": 18.3803,
|
| 9782 |
+
"step": 27920
|
| 9783 |
+
},
|
| 9784 |
+
{
|
| 9785 |
+
"epoch": 0.05448225767923912,
|
| 9786 |
+
"grad_norm": 8.9375,
|
| 9787 |
+
"learning_rate": 0.000491079538693911,
|
| 9788 |
+
"loss": 18.4133,
|
| 9789 |
+
"step": 27940
|
| 9790 |
+
},
|
| 9791 |
+
{
|
| 9792 |
+
"epoch": 0.05452125714787136,
|
| 9793 |
+
"grad_norm": 8.375,
|
| 9794 |
+
"learning_rate": 0.0004910730366630017,
|
| 9795 |
+
"loss": 18.3317,
|
| 9796 |
+
"step": 27960
|
| 9797 |
+
},
|
| 9798 |
+
{
|
| 9799 |
+
"epoch": 0.0545602566165036,
|
| 9800 |
+
"grad_norm": 8.9375,
|
| 9801 |
+
"learning_rate": 0.0004910665346320924,
|
| 9802 |
+
"loss": 18.3971,
|
| 9803 |
+
"step": 27980
|
| 9804 |
+
},
|
| 9805 |
+
{
|
| 9806 |
+
"epoch": 0.05459925608513584,
|
| 9807 |
+
"grad_norm": 9.3125,
|
| 9808 |
+
"learning_rate": 0.000491060032601183,
|
| 9809 |
+
"loss": 18.3958,
|
| 9810 |
+
"step": 28000
|
| 9811 |
}
|
| 9812 |
],
|
| 9813 |
"logging_steps": 20,
|
|
|
|
| 9827 |
"attributes": {}
|
| 9828 |
}
|
| 9829 |
},
|
| 9830 |
+
"total_flos": 2.058460925948802e+19,
|
| 9831 |
"train_batch_size": 48,
|
| 9832 |
"trial_name": null,
|
| 9833 |
"trial_params": null
|