Training in progress, step 71000, checkpoint
Browse files- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/pytorch_model.bin +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +353 -3
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 304481530
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:757633efe84a53c5ec97a90a7f4675f908dbeafb070171c08276f4ceae89bf82
|
| 3 |
size 304481530
|
last-checkpoint/pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 402029570
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6c319382408e536debfaba9985144c2b85aedc267f1adb41fa2fcd682a710d69
|
| 3 |
size 402029570
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b55180ad5c333f626bc6ef839beda747e8f0633fdb8a2329d1af0642155fcad0
|
| 3 |
size 14960
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6fb9e669a1e66d6084675ac17f9361f1d66f6538870dda5d62bb9fedf0717021
|
| 3 |
size 14960
|
last-checkpoint/rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:93c86e46203b6a91184b0093d776c5c5cbb5568a55f409f62928f5b11605d793
|
| 3 |
size 14960
|
last-checkpoint/rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:57649dd5fae41007b8326ad8bceda3664e8263c16462c398827f7c60518777a9
|
| 3 |
size 14960
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:065fc078fd1aeeb645695c18fb1eff98c533b26302779a57f06b17d1e0565e6a
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -24508,6 +24508,356 @@
|
|
| 24508 |
"learning_rate": 0.000482837447795457,
|
| 24509 |
"loss": 16.578,
|
| 24510 |
"step": 70000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24511 |
}
|
| 24512 |
],
|
| 24513 |
"logging_steps": 20,
|
|
@@ -24527,7 +24877,7 @@
|
|
| 24527 |
"attributes": {}
|
| 24528 |
}
|
| 24529 |
},
|
| 24530 |
-
"total_flos": 5.
|
| 24531 |
"train_batch_size": 48,
|
| 24532 |
"trial_name": null,
|
| 24533 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.10517334344577499,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 71000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 24508 |
"learning_rate": 0.000482837447795457,
|
| 24509 |
"loss": 16.578,
|
| 24510 |
"step": 70000
|
| 24511 |
+
},
|
| 24512 |
+
{
|
| 24513 |
+
"epoch": 0.10372165504328401,
|
| 24514 |
+
"grad_norm": 7.25,
|
| 24515 |
+
"learning_rate": 0.0004828325088604494,
|
| 24516 |
+
"loss": 16.562,
|
| 24517 |
+
"step": 70020
|
| 24518 |
+
},
|
| 24519 |
+
{
|
| 24520 |
+
"epoch": 0.1037512813372124,
|
| 24521 |
+
"grad_norm": 6.625,
|
| 24522 |
+
"learning_rate": 0.00048282756992544187,
|
| 24523 |
+
"loss": 16.5095,
|
| 24524 |
+
"step": 70040
|
| 24525 |
+
},
|
| 24526 |
+
{
|
| 24527 |
+
"epoch": 0.10378090763114078,
|
| 24528 |
+
"grad_norm": 6.09375,
|
| 24529 |
+
"learning_rate": 0.00048282263099043427,
|
| 24530 |
+
"loss": 16.5207,
|
| 24531 |
+
"step": 70060
|
| 24532 |
+
},
|
| 24533 |
+
{
|
| 24534 |
+
"epoch": 0.10381053392506917,
|
| 24535 |
+
"grad_norm": 6.625,
|
| 24536 |
+
"learning_rate": 0.0004828176920554267,
|
| 24537 |
+
"loss": 16.5471,
|
| 24538 |
+
"step": 70080
|
| 24539 |
+
},
|
| 24540 |
+
{
|
| 24541 |
+
"epoch": 0.10384016021899757,
|
| 24542 |
+
"grad_norm": 6.6875,
|
| 24543 |
+
"learning_rate": 0.00048281275312041916,
|
| 24544 |
+
"loss": 16.524,
|
| 24545 |
+
"step": 70100
|
| 24546 |
+
},
|
| 24547 |
+
{
|
| 24548 |
+
"epoch": 0.10386978651292596,
|
| 24549 |
+
"grad_norm": 6.1875,
|
| 24550 |
+
"learning_rate": 0.0004828078141854116,
|
| 24551 |
+
"loss": 16.4794,
|
| 24552 |
+
"step": 70120
|
| 24553 |
+
},
|
| 24554 |
+
{
|
| 24555 |
+
"epoch": 0.10389941280685434,
|
| 24556 |
+
"grad_norm": 6.75,
|
| 24557 |
+
"learning_rate": 0.000482802875250404,
|
| 24558 |
+
"loss": 16.5229,
|
| 24559 |
+
"step": 70140
|
| 24560 |
+
},
|
| 24561 |
+
{
|
| 24562 |
+
"epoch": 0.10392903910078273,
|
| 24563 |
+
"grad_norm": 6.125,
|
| 24564 |
+
"learning_rate": 0.00048279793631539645,
|
| 24565 |
+
"loss": 16.5296,
|
| 24566 |
+
"step": 70160
|
| 24567 |
+
},
|
| 24568 |
+
{
|
| 24569 |
+
"epoch": 0.10395866539471112,
|
| 24570 |
+
"grad_norm": 6.59375,
|
| 24571 |
+
"learning_rate": 0.0004827929973803889,
|
| 24572 |
+
"loss": 16.5783,
|
| 24573 |
+
"step": 70180
|
| 24574 |
+
},
|
| 24575 |
+
{
|
| 24576 |
+
"epoch": 0.1039882916886395,
|
| 24577 |
+
"grad_norm": 6.8125,
|
| 24578 |
+
"learning_rate": 0.0004827880584453813,
|
| 24579 |
+
"loss": 16.5537,
|
| 24580 |
+
"step": 70200
|
| 24581 |
+
},
|
| 24582 |
+
{
|
| 24583 |
+
"epoch": 0.10401791798256789,
|
| 24584 |
+
"grad_norm": 6.75,
|
| 24585 |
+
"learning_rate": 0.00048278311951037374,
|
| 24586 |
+
"loss": 16.5815,
|
| 24587 |
+
"step": 70220
|
| 24588 |
+
},
|
| 24589 |
+
{
|
| 24590 |
+
"epoch": 0.10404754427649628,
|
| 24591 |
+
"grad_norm": 8.6875,
|
| 24592 |
+
"learning_rate": 0.0004827781805753662,
|
| 24593 |
+
"loss": 16.5732,
|
| 24594 |
+
"step": 70240
|
| 24595 |
+
},
|
| 24596 |
+
{
|
| 24597 |
+
"epoch": 0.10407717057042466,
|
| 24598 |
+
"grad_norm": 6.5,
|
| 24599 |
+
"learning_rate": 0.00048277324164035864,
|
| 24600 |
+
"loss": 16.5605,
|
| 24601 |
+
"step": 70260
|
| 24602 |
+
},
|
| 24603 |
+
{
|
| 24604 |
+
"epoch": 0.10410679686435305,
|
| 24605 |
+
"grad_norm": 6.65625,
|
| 24606 |
+
"learning_rate": 0.00048276830270535103,
|
| 24607 |
+
"loss": 16.5224,
|
| 24608 |
+
"step": 70280
|
| 24609 |
+
},
|
| 24610 |
+
{
|
| 24611 |
+
"epoch": 0.10413642315828144,
|
| 24612 |
+
"grad_norm": 7.0625,
|
| 24613 |
+
"learning_rate": 0.0004827633637703435,
|
| 24614 |
+
"loss": 16.5452,
|
| 24615 |
+
"step": 70300
|
| 24616 |
+
},
|
| 24617 |
+
{
|
| 24618 |
+
"epoch": 0.10416604945220982,
|
| 24619 |
+
"grad_norm": 6.71875,
|
| 24620 |
+
"learning_rate": 0.0004827584248353359,
|
| 24621 |
+
"loss": 16.5098,
|
| 24622 |
+
"step": 70320
|
| 24623 |
+
},
|
| 24624 |
+
{
|
| 24625 |
+
"epoch": 0.10419567574613821,
|
| 24626 |
+
"grad_norm": 6.25,
|
| 24627 |
+
"learning_rate": 0.0004827534859003284,
|
| 24628 |
+
"loss": 16.5676,
|
| 24629 |
+
"step": 70340
|
| 24630 |
+
},
|
| 24631 |
+
{
|
| 24632 |
+
"epoch": 0.1042253020400666,
|
| 24633 |
+
"grad_norm": 6.875,
|
| 24634 |
+
"learning_rate": 0.00048274854696532077,
|
| 24635 |
+
"loss": 16.4514,
|
| 24636 |
+
"step": 70360
|
| 24637 |
+
},
|
| 24638 |
+
{
|
| 24639 |
+
"epoch": 0.10425492833399498,
|
| 24640 |
+
"grad_norm": 6.40625,
|
| 24641 |
+
"learning_rate": 0.0004827436080303132,
|
| 24642 |
+
"loss": 16.5532,
|
| 24643 |
+
"step": 70380
|
| 24644 |
+
},
|
| 24645 |
+
{
|
| 24646 |
+
"epoch": 0.10428455462792338,
|
| 24647 |
+
"grad_norm": 6.65625,
|
| 24648 |
+
"learning_rate": 0.00048273866909530566,
|
| 24649 |
+
"loss": 16.5367,
|
| 24650 |
+
"step": 70400
|
| 24651 |
+
},
|
| 24652 |
+
{
|
| 24653 |
+
"epoch": 0.10431418092185177,
|
| 24654 |
+
"grad_norm": 6.15625,
|
| 24655 |
+
"learning_rate": 0.0004827337301602981,
|
| 24656 |
+
"loss": 16.5784,
|
| 24657 |
+
"step": 70420
|
| 24658 |
+
},
|
| 24659 |
+
{
|
| 24660 |
+
"epoch": 0.10434380721578015,
|
| 24661 |
+
"grad_norm": 6.59375,
|
| 24662 |
+
"learning_rate": 0.0004827287912252905,
|
| 24663 |
+
"loss": 16.5085,
|
| 24664 |
+
"step": 70440
|
| 24665 |
+
},
|
| 24666 |
+
{
|
| 24667 |
+
"epoch": 0.10437343350970854,
|
| 24668 |
+
"grad_norm": 6.53125,
|
| 24669 |
+
"learning_rate": 0.00048272385229028295,
|
| 24670 |
+
"loss": 16.5486,
|
| 24671 |
+
"step": 70460
|
| 24672 |
+
},
|
| 24673 |
+
{
|
| 24674 |
+
"epoch": 0.10440305980363693,
|
| 24675 |
+
"grad_norm": 5.75,
|
| 24676 |
+
"learning_rate": 0.0004827189133552754,
|
| 24677 |
+
"loss": 16.4874,
|
| 24678 |
+
"step": 70480
|
| 24679 |
+
},
|
| 24680 |
+
{
|
| 24681 |
+
"epoch": 0.10443268609756531,
|
| 24682 |
+
"grad_norm": 6.84375,
|
| 24683 |
+
"learning_rate": 0.00048271397442026785,
|
| 24684 |
+
"loss": 16.4957,
|
| 24685 |
+
"step": 70500
|
| 24686 |
+
},
|
| 24687 |
+
{
|
| 24688 |
+
"epoch": 0.1044623123914937,
|
| 24689 |
+
"grad_norm": 6.75,
|
| 24690 |
+
"learning_rate": 0.00048270903548526024,
|
| 24691 |
+
"loss": 16.5765,
|
| 24692 |
+
"step": 70520
|
| 24693 |
+
},
|
| 24694 |
+
{
|
| 24695 |
+
"epoch": 0.10449193868542209,
|
| 24696 |
+
"grad_norm": 8.125,
|
| 24697 |
+
"learning_rate": 0.00048270409655025263,
|
| 24698 |
+
"loss": 16.5136,
|
| 24699 |
+
"step": 70540
|
| 24700 |
+
},
|
| 24701 |
+
{
|
| 24702 |
+
"epoch": 0.10452156497935047,
|
| 24703 |
+
"grad_norm": 6.78125,
|
| 24704 |
+
"learning_rate": 0.00048269915761524514,
|
| 24705 |
+
"loss": 16.5299,
|
| 24706 |
+
"step": 70560
|
| 24707 |
+
},
|
| 24708 |
+
{
|
| 24709 |
+
"epoch": 0.10455119127327886,
|
| 24710 |
+
"grad_norm": 6.6875,
|
| 24711 |
+
"learning_rate": 0.00048269421868023753,
|
| 24712 |
+
"loss": 16.4576,
|
| 24713 |
+
"step": 70580
|
| 24714 |
+
},
|
| 24715 |
+
{
|
| 24716 |
+
"epoch": 0.10458081756720725,
|
| 24717 |
+
"grad_norm": 6.84375,
|
| 24718 |
+
"learning_rate": 0.00048268927974523,
|
| 24719 |
+
"loss": 16.4926,
|
| 24720 |
+
"step": 70600
|
| 24721 |
+
},
|
| 24722 |
+
{
|
| 24723 |
+
"epoch": 0.10461044386113563,
|
| 24724 |
+
"grad_norm": 6.75,
|
| 24725 |
+
"learning_rate": 0.0004826843408102224,
|
| 24726 |
+
"loss": 16.4703,
|
| 24727 |
+
"step": 70620
|
| 24728 |
+
},
|
| 24729 |
+
{
|
| 24730 |
+
"epoch": 0.10464007015506402,
|
| 24731 |
+
"grad_norm": 6.96875,
|
| 24732 |
+
"learning_rate": 0.0004826794018752149,
|
| 24733 |
+
"loss": 16.5149,
|
| 24734 |
+
"step": 70640
|
| 24735 |
+
},
|
| 24736 |
+
{
|
| 24737 |
+
"epoch": 0.1046696964489924,
|
| 24738 |
+
"grad_norm": 6.3125,
|
| 24739 |
+
"learning_rate": 0.00048267446294020727,
|
| 24740 |
+
"loss": 16.4792,
|
| 24741 |
+
"step": 70660
|
| 24742 |
+
},
|
| 24743 |
+
{
|
| 24744 |
+
"epoch": 0.10469932274292079,
|
| 24745 |
+
"grad_norm": 6.59375,
|
| 24746 |
+
"learning_rate": 0.0004826695240051997,
|
| 24747 |
+
"loss": 16.5295,
|
| 24748 |
+
"step": 70680
|
| 24749 |
+
},
|
| 24750 |
+
{
|
| 24751 |
+
"epoch": 0.10472894903684918,
|
| 24752 |
+
"grad_norm": 8.125,
|
| 24753 |
+
"learning_rate": 0.00048266458507019216,
|
| 24754 |
+
"loss": 16.5405,
|
| 24755 |
+
"step": 70700
|
| 24756 |
+
},
|
| 24757 |
+
{
|
| 24758 |
+
"epoch": 0.10475857533077758,
|
| 24759 |
+
"grad_norm": 7.90625,
|
| 24760 |
+
"learning_rate": 0.0004826596461351846,
|
| 24761 |
+
"loss": 16.543,
|
| 24762 |
+
"step": 70720
|
| 24763 |
+
},
|
| 24764 |
+
{
|
| 24765 |
+
"epoch": 0.10478820162470596,
|
| 24766 |
+
"grad_norm": 6.3125,
|
| 24767 |
+
"learning_rate": 0.000482654707200177,
|
| 24768 |
+
"loss": 16.5194,
|
| 24769 |
+
"step": 70740
|
| 24770 |
+
},
|
| 24771 |
+
{
|
| 24772 |
+
"epoch": 0.10481782791863435,
|
| 24773 |
+
"grad_norm": 6.75,
|
| 24774 |
+
"learning_rate": 0.00048264976826516945,
|
| 24775 |
+
"loss": 16.5217,
|
| 24776 |
+
"step": 70760
|
| 24777 |
+
},
|
| 24778 |
+
{
|
| 24779 |
+
"epoch": 0.10484745421256274,
|
| 24780 |
+
"grad_norm": 6.09375,
|
| 24781 |
+
"learning_rate": 0.0004826448293301619,
|
| 24782 |
+
"loss": 16.5735,
|
| 24783 |
+
"step": 70780
|
| 24784 |
+
},
|
| 24785 |
+
{
|
| 24786 |
+
"epoch": 0.10487708050649112,
|
| 24787 |
+
"grad_norm": 6.125,
|
| 24788 |
+
"learning_rate": 0.00048263989039515435,
|
| 24789 |
+
"loss": 16.4886,
|
| 24790 |
+
"step": 70800
|
| 24791 |
+
},
|
| 24792 |
+
{
|
| 24793 |
+
"epoch": 0.10490670680041951,
|
| 24794 |
+
"grad_norm": 7.71875,
|
| 24795 |
+
"learning_rate": 0.00048263495146014674,
|
| 24796 |
+
"loss": 16.4874,
|
| 24797 |
+
"step": 70820
|
| 24798 |
+
},
|
| 24799 |
+
{
|
| 24800 |
+
"epoch": 0.1049363330943479,
|
| 24801 |
+
"grad_norm": 6.59375,
|
| 24802 |
+
"learning_rate": 0.00048263001252513913,
|
| 24803 |
+
"loss": 16.5424,
|
| 24804 |
+
"step": 70840
|
| 24805 |
+
},
|
| 24806 |
+
{
|
| 24807 |
+
"epoch": 0.10496595938827628,
|
| 24808 |
+
"grad_norm": 6.84375,
|
| 24809 |
+
"learning_rate": 0.00048262507359013164,
|
| 24810 |
+
"loss": 16.5108,
|
| 24811 |
+
"step": 70860
|
| 24812 |
+
},
|
| 24813 |
+
{
|
| 24814 |
+
"epoch": 0.10499558568220467,
|
| 24815 |
+
"grad_norm": 6.71875,
|
| 24816 |
+
"learning_rate": 0.00048262013465512403,
|
| 24817 |
+
"loss": 16.4778,
|
| 24818 |
+
"step": 70880
|
| 24819 |
+
},
|
| 24820 |
+
{
|
| 24821 |
+
"epoch": 0.10502521197613306,
|
| 24822 |
+
"grad_norm": 7.21875,
|
| 24823 |
+
"learning_rate": 0.0004826151957201165,
|
| 24824 |
+
"loss": 16.4908,
|
| 24825 |
+
"step": 70900
|
| 24826 |
+
},
|
| 24827 |
+
{
|
| 24828 |
+
"epoch": 0.10505483827006144,
|
| 24829 |
+
"grad_norm": 6.8125,
|
| 24830 |
+
"learning_rate": 0.0004826102567851089,
|
| 24831 |
+
"loss": 16.4564,
|
| 24832 |
+
"step": 70920
|
| 24833 |
+
},
|
| 24834 |
+
{
|
| 24835 |
+
"epoch": 0.10508446456398983,
|
| 24836 |
+
"grad_norm": 6.625,
|
| 24837 |
+
"learning_rate": 0.0004826053178501014,
|
| 24838 |
+
"loss": 16.5258,
|
| 24839 |
+
"step": 70940
|
| 24840 |
+
},
|
| 24841 |
+
{
|
| 24842 |
+
"epoch": 0.10511409085791822,
|
| 24843 |
+
"grad_norm": 6.21875,
|
| 24844 |
+
"learning_rate": 0.00048260037891509377,
|
| 24845 |
+
"loss": 16.5403,
|
| 24846 |
+
"step": 70960
|
| 24847 |
+
},
|
| 24848 |
+
{
|
| 24849 |
+
"epoch": 0.1051437171518466,
|
| 24850 |
+
"grad_norm": 7.375,
|
| 24851 |
+
"learning_rate": 0.0004825954399800862,
|
| 24852 |
+
"loss": 16.575,
|
| 24853 |
+
"step": 70980
|
| 24854 |
+
},
|
| 24855 |
+
{
|
| 24856 |
+
"epoch": 0.10517334344577499,
|
| 24857 |
+
"grad_norm": 6.4375,
|
| 24858 |
+
"learning_rate": 0.00048259050104507866,
|
| 24859 |
+
"loss": 16.5599,
|
| 24860 |
+
"step": 71000
|
| 24861 |
}
|
| 24862 |
],
|
| 24863 |
"logging_steps": 20,
|
|
|
|
| 24877 |
"attributes": {}
|
| 24878 |
}
|
| 24879 |
},
|
| 24880 |
+
"total_flos": 5.220171364156257e+19,
|
| 24881 |
"train_batch_size": 48,
|
| 24882 |
"trial_name": null,
|
| 24883 |
"trial_params": null
|