Training in progress, step 116000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88eb3f94bc7241f618e5c9770b54c115b258d914f67d481780ad17863ab32c2e
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ead16386e0cfae3ee1c925e0e05a55f093ed2c84207e3beb26950f24f2d0edd3
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:315a996739a8cfadd830b0d25c5fc7336620692744591af847d9b45121986328
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4f9c807b963b46c441b7e935adcacbb554bdd0c85992b7453ee29eed159b81fb
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -20478,11 +20478,189 @@
|
|
| 20478 |
"eval_steps_per_second": 15.153,
|
| 20479 |
"num_input_tokens_seen": 60283464768,
|
| 20480 |
"step": 115000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20481 |
}
|
| 20482 |
],
|
| 20483 |
"logging_steps": 50,
|
| 20484 |
"max_steps": 140000,
|
| 20485 |
-
"num_input_tokens_seen":
|
| 20486 |
"num_train_epochs": 2,
|
| 20487 |
"save_steps": 1000,
|
| 20488 |
"stateful_callbacks": {
|
|
@@ -20497,7 +20675,7 @@
|
|
| 20497 |
"attributes": {}
|
| 20498 |
}
|
| 20499 |
},
|
| 20500 |
-
"total_flos": 1.
|
| 20501 |
"train_batch_size": 32,
|
| 20502 |
"trial_name": null,
|
| 20503 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.1066482225693741,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 116000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 20478 |
"eval_steps_per_second": 15.153,
|
| 20479 |
"num_input_tokens_seen": 60283464768,
|
| 20480 |
"step": 115000
|
| 20481 |
+
},
|
| 20482 |
+
{
|
| 20483 |
+
"epoch": 1.0975851747617968,
|
| 20484 |
+
"grad_norm": 0.14440514147281647,
|
| 20485 |
+
"learning_rate": 0.0009710078129677895,
|
| 20486 |
+
"loss": 2.0927,
|
| 20487 |
+
"num_input_tokens_seen": 60309676352,
|
| 20488 |
+
"step": 115050
|
| 20489 |
+
},
|
| 20490 |
+
{
|
| 20491 |
+
"epoch": 1.098062177277985,
|
| 20492 |
+
"grad_norm": 0.13419468700885773,
|
| 20493 |
+
"learning_rate": 0.0009700591372846095,
|
| 20494 |
+
"loss": 2.0871,
|
| 20495 |
+
"num_input_tokens_seen": 60335889280,
|
| 20496 |
+
"step": 115100
|
| 20497 |
+
},
|
| 20498 |
+
{
|
| 20499 |
+
"epoch": 1.0985391797941735,
|
| 20500 |
+
"grad_norm": 0.14434845745563507,
|
| 20501 |
+
"learning_rate": 0.0009690956679612422,
|
| 20502 |
+
"loss": 2.0823,
|
| 20503 |
+
"num_input_tokens_seen": 60362096256,
|
| 20504 |
+
"step": 115150
|
| 20505 |
+
},
|
| 20506 |
+
{
|
| 20507 |
+
"epoch": 1.0990161823103617,
|
| 20508 |
+
"grad_norm": 0.14158272743225098,
|
| 20509 |
+
"learning_rate": 0.0009681174353198686,
|
| 20510 |
+
"loss": 2.0932,
|
| 20511 |
+
"num_input_tokens_seen": 60388308192,
|
| 20512 |
+
"step": 115200
|
| 20513 |
+
},
|
| 20514 |
+
{
|
| 20515 |
+
"epoch": 1.09949318482655,
|
| 20516 |
+
"grad_norm": 0.1499590128660202,
|
| 20517 |
+
"learning_rate": 0.0009671244701472999,
|
| 20518 |
+
"loss": 2.0901,
|
| 20519 |
+
"num_input_tokens_seen": 60414516160,
|
| 20520 |
+
"step": 115250
|
| 20521 |
+
},
|
| 20522 |
+
{
|
| 20523 |
+
"epoch": 1.0999701873427383,
|
| 20524 |
+
"grad_norm": 0.13877320289611816,
|
| 20525 |
+
"learning_rate": 0.0009661168036940071,
|
| 20526 |
+
"loss": 2.0915,
|
| 20527 |
+
"num_input_tokens_seen": 60440722624,
|
| 20528 |
+
"step": 115300
|
| 20529 |
+
},
|
| 20530 |
+
{
|
| 20531 |
+
"epoch": 1.1004471898589265,
|
| 20532 |
+
"grad_norm": 0.14336808025836945,
|
| 20533 |
+
"learning_rate": 0.0009650944676731382,
|
| 20534 |
+
"loss": 2.0846,
|
| 20535 |
+
"num_input_tokens_seen": 60466923616,
|
| 20536 |
+
"step": 115350
|
| 20537 |
+
},
|
| 20538 |
+
{
|
| 20539 |
+
"epoch": 1.1009241923751147,
|
| 20540 |
+
"grad_norm": 0.16042272746562958,
|
| 20541 |
+
"learning_rate": 0.0009640574942595195,
|
| 20542 |
+
"loss": 2.0942,
|
| 20543 |
+
"num_input_tokens_seen": 60493123456,
|
| 20544 |
+
"step": 115400
|
| 20545 |
+
},
|
| 20546 |
+
{
|
| 20547 |
+
"epoch": 1.101401194891303,
|
| 20548 |
+
"grad_norm": 0.14399364590644836,
|
| 20549 |
+
"learning_rate": 0.0009630059160886439,
|
| 20550 |
+
"loss": 2.0988,
|
| 20551 |
+
"num_input_tokens_seen": 60519323040,
|
| 20552 |
+
"step": 115450
|
| 20553 |
+
},
|
| 20554 |
+
{
|
| 20555 |
+
"epoch": 1.1018781974074914,
|
| 20556 |
+
"grad_norm": 0.14042776823043823,
|
| 20557 |
+
"learning_rate": 0.0009619397662556434,
|
| 20558 |
+
"loss": 2.0916,
|
| 20559 |
+
"num_input_tokens_seen": 60545534656,
|
| 20560 |
+
"step": 115500
|
| 20561 |
+
},
|
| 20562 |
+
{
|
| 20563 |
+
"epoch": 1.1018781974074914,
|
| 20564 |
+
"eval_loss": 2.0105109214782715,
|
| 20565 |
+
"eval_runtime": 82.3145,
|
| 20566 |
+
"eval_samples_per_second": 60.743,
|
| 20567 |
+
"eval_steps_per_second": 15.186,
|
| 20568 |
+
"num_input_tokens_seen": 60545534656,
|
| 20569 |
+
"step": 115500
|
| 20570 |
+
},
|
| 20571 |
+
{
|
| 20572 |
+
"epoch": 1.1023551999236796,
|
| 20573 |
+
"grad_norm": 0.1399744153022766,
|
| 20574 |
+
"learning_rate": 0.000960859078314247,
|
| 20575 |
+
"loss": 2.096,
|
| 20576 |
+
"num_input_tokens_seen": 60571738272,
|
| 20577 |
+
"step": 115550
|
| 20578 |
+
},
|
| 20579 |
+
{
|
| 20580 |
+
"epoch": 1.1028322024398678,
|
| 20581 |
+
"grad_norm": 0.14161787927150726,
|
| 20582 |
+
"learning_rate": 0.0009597638862757254,
|
| 20583 |
+
"loss": 2.0916,
|
| 20584 |
+
"num_input_tokens_seen": 60597952672,
|
| 20585 |
+
"step": 115600
|
| 20586 |
+
},
|
| 20587 |
+
{
|
| 20588 |
+
"epoch": 1.1033092049560562,
|
| 20589 |
+
"grad_norm": 0.14088015258312225,
|
| 20590 |
+
"learning_rate": 0.0009586542246078203,
|
| 20591 |
+
"loss": 2.0856,
|
| 20592 |
+
"num_input_tokens_seen": 60624155648,
|
| 20593 |
+
"step": 115650
|
| 20594 |
+
},
|
| 20595 |
+
{
|
| 20596 |
+
"epoch": 1.1037862074722444,
|
| 20597 |
+
"grad_norm": 0.13098938763141632,
|
| 20598 |
+
"learning_rate": 0.00095753012823366,
|
| 20599 |
+
"loss": 2.0849,
|
| 20600 |
+
"num_input_tokens_seen": 60650370048,
|
| 20601 |
+
"step": 115700
|
| 20602 |
+
},
|
| 20603 |
+
{
|
| 20604 |
+
"epoch": 1.1042632099884326,
|
| 20605 |
+
"grad_norm": 0.14463865756988525,
|
| 20606 |
+
"learning_rate": 0.0009563916325306594,
|
| 20607 |
+
"loss": 2.0918,
|
| 20608 |
+
"num_input_tokens_seen": 60676580928,
|
| 20609 |
+
"step": 115750
|
| 20610 |
+
},
|
| 20611 |
+
{
|
| 20612 |
+
"epoch": 1.104740212504621,
|
| 20613 |
+
"grad_norm": 0.14490677416324615,
|
| 20614 |
+
"learning_rate": 0.000955238773329408,
|
| 20615 |
+
"loss": 2.0996,
|
| 20616 |
+
"num_input_tokens_seen": 60702794432,
|
| 20617 |
+
"step": 115800
|
| 20618 |
+
},
|
| 20619 |
+
{
|
| 20620 |
+
"epoch": 1.1052172150208093,
|
| 20621 |
+
"grad_norm": 0.14372467994689941,
|
| 20622 |
+
"learning_rate": 0.0009540715869125407,
|
| 20623 |
+
"loss": 2.09,
|
| 20624 |
+
"num_input_tokens_seen": 60729000064,
|
| 20625 |
+
"step": 115850
|
| 20626 |
+
},
|
| 20627 |
+
{
|
| 20628 |
+
"epoch": 1.1056942175369975,
|
| 20629 |
+
"grad_norm": 0.16468504071235657,
|
| 20630 |
+
"learning_rate": 0.000952890110013597,
|
| 20631 |
+
"loss": 2.0901,
|
| 20632 |
+
"num_input_tokens_seen": 60755212896,
|
| 20633 |
+
"step": 115900
|
| 20634 |
+
},
|
| 20635 |
+
{
|
| 20636 |
+
"epoch": 1.1061712200531857,
|
| 20637 |
+
"grad_norm": 0.390666663646698,
|
| 20638 |
+
"learning_rate": 0.0009516943798158648,
|
| 20639 |
+
"loss": 2.0855,
|
| 20640 |
+
"num_input_tokens_seen": 60781425984,
|
| 20641 |
+
"step": 115950
|
| 20642 |
+
},
|
| 20643 |
+
{
|
| 20644 |
+
"epoch": 1.1066482225693741,
|
| 20645 |
+
"grad_norm": 0.14308005571365356,
|
| 20646 |
+
"learning_rate": 0.0009504844339512095,
|
| 20647 |
+
"loss": 2.1125,
|
| 20648 |
+
"num_input_tokens_seen": 60807636160,
|
| 20649 |
+
"step": 116000
|
| 20650 |
+
},
|
| 20651 |
+
{
|
| 20652 |
+
"epoch": 1.1066482225693741,
|
| 20653 |
+
"eval_loss": 2.0120937824249268,
|
| 20654 |
+
"eval_runtime": 82.7927,
|
| 20655 |
+
"eval_samples_per_second": 60.392,
|
| 20656 |
+
"eval_steps_per_second": 15.098,
|
| 20657 |
+
"num_input_tokens_seen": 60807636160,
|
| 20658 |
+
"step": 116000
|
| 20659 |
}
|
| 20660 |
],
|
| 20661 |
"logging_steps": 50,
|
| 20662 |
"max_steps": 140000,
|
| 20663 |
+
"num_input_tokens_seen": 60807636160,
|
| 20664 |
"num_train_epochs": 2,
|
| 20665 |
"save_steps": 1000,
|
| 20666 |
"stateful_callbacks": {
|
|
|
|
| 20675 |
"attributes": {}
|
| 20676 |
}
|
| 20677 |
},
|
| 20678 |
+
"total_flos": 1.0761842469036442e+20,
|
| 20679 |
"train_batch_size": 32,
|
| 20680 |
"trial_name": null,
|
| 20681 |
"trial_params": null
|