Training in progress, step 122000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:81b6085fb8cdb1171b74b00e5808748cf92ce0ddf8ba548a106b9e635e652ce5
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2584540bb683d62bf86744736defc5b1b50bc3492f528f85e121c6574fb37a99
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3efcdbc541e421955fc1801cd719c72805694f44c64389ef735698f77e94dcbf
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bd02e3ed8ffd9c6d891f91758bb97fdbe6142d1b35a6390b66d152313f44683b
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -21546,11 +21546,189 @@
|
|
| 21546 |
"eval_steps_per_second": 15.103,
|
| 21547 |
"num_input_tokens_seen": 63428647904,
|
| 21548 |
"step": 121000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21549 |
}
|
| 21550 |
],
|
| 21551 |
"logging_steps": 50,
|
| 21552 |
"max_steps": 140000,
|
| 21553 |
-
"num_input_tokens_seen":
|
| 21554 |
"num_train_epochs": 2,
|
| 21555 |
"save_steps": 1000,
|
| 21556 |
"stateful_callbacks": {
|
|
@@ -21565,7 +21743,7 @@
|
|
| 21565 |
"attributes": {}
|
| 21566 |
}
|
| 21567 |
},
|
| 21568 |
-
"total_flos": 1.
|
| 21569 |
"train_batch_size": 32,
|
| 21570 |
"trial_name": null,
|
| 21571 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.1638885245119668,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 122000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 21546 |
"eval_steps_per_second": 15.103,
|
| 21547 |
"num_input_tokens_seen": 63428647904,
|
| 21548 |
"step": 121000
|
| 21549 |
+
},
|
| 21550 |
+
{
|
| 21551 |
+
"epoch": 1.1548254767043895,
|
| 21552 |
+
"grad_norm": 0.1527141034603119,
|
| 21553 |
+
"learning_rate": 0.000763636808879545,
|
| 21554 |
+
"loss": 2.0812,
|
| 21555 |
+
"num_input_tokens_seen": 63454858592,
|
| 21556 |
+
"step": 121050
|
| 21557 |
+
},
|
| 21558 |
+
{
|
| 21559 |
+
"epoch": 1.155302479220578,
|
| 21560 |
+
"grad_norm": 0.14409616589546204,
|
| 21561 |
+
"learning_rate": 0.0007612492823579744,
|
| 21562 |
+
"loss": 2.0757,
|
| 21563 |
+
"num_input_tokens_seen": 63481069536,
|
| 21564 |
+
"step": 121100
|
| 21565 |
+
},
|
| 21566 |
+
{
|
| 21567 |
+
"epoch": 1.1557794817367661,
|
| 21568 |
+
"grad_norm": 0.1311630755662918,
|
| 21569 |
+
"learning_rate": 0.0007588535338328816,
|
| 21570 |
+
"loss": 2.0714,
|
| 21571 |
+
"num_input_tokens_seen": 63507276640,
|
| 21572 |
+
"step": 121150
|
| 21573 |
+
},
|
| 21574 |
+
{
|
| 21575 |
+
"epoch": 1.1562564842529544,
|
| 21576 |
+
"grad_norm": 0.12864112854003906,
|
| 21577 |
+
"learning_rate": 0.0007564496387029531,
|
| 21578 |
+
"loss": 2.0703,
|
| 21579 |
+
"num_input_tokens_seen": 63533491040,
|
| 21580 |
+
"step": 121200
|
| 21581 |
+
},
|
| 21582 |
+
{
|
| 21583 |
+
"epoch": 1.1567334867691428,
|
| 21584 |
+
"grad_norm": 0.1277550309896469,
|
| 21585 |
+
"learning_rate": 0.0007540376726232647,
|
| 21586 |
+
"loss": 2.0833,
|
| 21587 |
+
"num_input_tokens_seen": 63559699712,
|
| 21588 |
+
"step": 121250
|
| 21589 |
+
},
|
| 21590 |
+
{
|
| 21591 |
+
"epoch": 1.157210489285331,
|
| 21592 |
+
"grad_norm": 0.13141444325447083,
|
| 21593 |
+
"learning_rate": 0.0007516177115029001,
|
| 21594 |
+
"loss": 2.0755,
|
| 21595 |
+
"num_input_tokens_seen": 63585905408,
|
| 21596 |
+
"step": 121300
|
| 21597 |
+
},
|
| 21598 |
+
{
|
| 21599 |
+
"epoch": 1.1576874918015192,
|
| 21600 |
+
"grad_norm": 0.13436725735664368,
|
| 21601 |
+
"learning_rate": 0.0007491898315025615,
|
| 21602 |
+
"loss": 2.0716,
|
| 21603 |
+
"num_input_tokens_seen": 63612116704,
|
| 21604 |
+
"step": 121350
|
| 21605 |
+
},
|
| 21606 |
+
{
|
| 21607 |
+
"epoch": 1.1581644943177074,
|
| 21608 |
+
"grad_norm": 0.13668642938137054,
|
| 21609 |
+
"learning_rate": 0.0007467541090321735,
|
| 21610 |
+
"loss": 2.0766,
|
| 21611 |
+
"num_input_tokens_seen": 63638330048,
|
| 21612 |
+
"step": 121400
|
| 21613 |
+
},
|
| 21614 |
+
{
|
| 21615 |
+
"epoch": 1.1586414968338958,
|
| 21616 |
+
"grad_norm": 0.22589260339736938,
|
| 21617 |
+
"learning_rate": 0.0007443106207484776,
|
| 21618 |
+
"loss": 2.0793,
|
| 21619 |
+
"num_input_tokens_seen": 63664542944,
|
| 21620 |
+
"step": 121450
|
| 21621 |
+
},
|
| 21622 |
+
{
|
| 21623 |
+
"epoch": 1.159118499350084,
|
| 21624 |
+
"grad_norm": 0.14154261350631714,
|
| 21625 |
+
"learning_rate": 0.00074185944355262,
|
| 21626 |
+
"loss": 2.0938,
|
| 21627 |
+
"num_input_tokens_seen": 63690757024,
|
| 21628 |
+
"step": 121500
|
| 21629 |
+
},
|
| 21630 |
+
{
|
| 21631 |
+
"epoch": 1.159118499350084,
|
| 21632 |
+
"eval_loss": 1.9929685592651367,
|
| 21633 |
+
"eval_runtime": 82.8366,
|
| 21634 |
+
"eval_samples_per_second": 60.36,
|
| 21635 |
+
"eval_steps_per_second": 15.09,
|
| 21636 |
+
"num_input_tokens_seen": 63690757024,
|
| 21637 |
+
"step": 121500
|
| 21638 |
+
},
|
| 21639 |
+
{
|
| 21640 |
+
"epoch": 1.1595955018662722,
|
| 21641 |
+
"grad_norm": 0.13303405046463013,
|
| 21642 |
+
"learning_rate": 0.0007394006545877314,
|
| 21643 |
+
"loss": 2.078,
|
| 21644 |
+
"num_input_tokens_seen": 63716968288,
|
| 21645 |
+
"step": 121550
|
| 21646 |
+
},
|
| 21647 |
+
{
|
| 21648 |
+
"epoch": 1.1600725043824607,
|
| 21649 |
+
"grad_norm": 0.12762907147407532,
|
| 21650 |
+
"learning_rate": 0.0007369343312364993,
|
| 21651 |
+
"loss": 2.0757,
|
| 21652 |
+
"num_input_tokens_seen": 63743181728,
|
| 21653 |
+
"step": 121600
|
| 21654 |
+
},
|
| 21655 |
+
{
|
| 21656 |
+
"epoch": 1.1605495068986489,
|
| 21657 |
+
"grad_norm": 0.160507932305336,
|
| 21658 |
+
"learning_rate": 0.0007344605511187322,
|
| 21659 |
+
"loss": 2.076,
|
| 21660 |
+
"num_input_tokens_seen": 63769396128,
|
| 21661 |
+
"step": 121650
|
| 21662 |
+
},
|
| 21663 |
+
{
|
| 21664 |
+
"epoch": 1.161026509414837,
|
| 21665 |
+
"grad_norm": 0.14160197973251343,
|
| 21666 |
+
"learning_rate": 0.0007319793920889171,
|
| 21667 |
+
"loss": 2.0762,
|
| 21668 |
+
"num_input_tokens_seen": 63795607296,
|
| 21669 |
+
"step": 121700
|
| 21670 |
+
},
|
| 21671 |
+
{
|
| 21672 |
+
"epoch": 1.1615035119310255,
|
| 21673 |
+
"grad_norm": 0.15858200192451477,
|
| 21674 |
+
"learning_rate": 0.0007294909322337689,
|
| 21675 |
+
"loss": 2.08,
|
| 21676 |
+
"num_input_tokens_seen": 63821818336,
|
| 21677 |
+
"step": 121750
|
| 21678 |
+
},
|
| 21679 |
+
{
|
| 21680 |
+
"epoch": 1.1619805144472137,
|
| 21681 |
+
"grad_norm": 0.13940422236919403,
|
| 21682 |
+
"learning_rate": 0.0007269952498697733,
|
| 21683 |
+
"loss": 2.0816,
|
| 21684 |
+
"num_input_tokens_seen": 63848031552,
|
| 21685 |
+
"step": 121800
|
| 21686 |
+
},
|
| 21687 |
+
{
|
| 21688 |
+
"epoch": 1.162457516963402,
|
| 21689 |
+
"grad_norm": 0.13600219786167145,
|
| 21690 |
+
"learning_rate": 0.0007244924235407223,
|
| 21691 |
+
"loss": 2.0757,
|
| 21692 |
+
"num_input_tokens_seen": 63874245952,
|
| 21693 |
+
"step": 121850
|
| 21694 |
+
},
|
| 21695 |
+
{
|
| 21696 |
+
"epoch": 1.1629345194795904,
|
| 21697 |
+
"grad_norm": 0.14759120345115662,
|
| 21698 |
+
"learning_rate": 0.0007219825320152411,
|
| 21699 |
+
"loss": 2.0883,
|
| 21700 |
+
"num_input_tokens_seen": 63900453792,
|
| 21701 |
+
"step": 121900
|
| 21702 |
+
},
|
| 21703 |
+
{
|
| 21704 |
+
"epoch": 1.1634115219957786,
|
| 21705 |
+
"grad_norm": 0.12860442698001862,
|
| 21706 |
+
"learning_rate": 0.0007194656542843102,
|
| 21707 |
+
"loss": 2.0802,
|
| 21708 |
+
"num_input_tokens_seen": 63926661920,
|
| 21709 |
+
"step": 121950
|
| 21710 |
+
},
|
| 21711 |
+
{
|
| 21712 |
+
"epoch": 1.1638885245119668,
|
| 21713 |
+
"grad_norm": 0.13766394555568695,
|
| 21714 |
+
"learning_rate": 0.0007169418695587791,
|
| 21715 |
+
"loss": 2.072,
|
| 21716 |
+
"num_input_tokens_seen": 63952872768,
|
| 21717 |
+
"step": 122000
|
| 21718 |
+
},
|
| 21719 |
+
{
|
| 21720 |
+
"epoch": 1.1638885245119668,
|
| 21721 |
+
"eval_loss": 1.991066813468933,
|
| 21722 |
+
"eval_runtime": 82.2634,
|
| 21723 |
+
"eval_samples_per_second": 60.78,
|
| 21724 |
+
"eval_steps_per_second": 15.195,
|
| 21725 |
+
"num_input_tokens_seen": 63952872768,
|
| 21726 |
+
"step": 122000
|
| 21727 |
}
|
| 21728 |
],
|
| 21729 |
"logging_steps": 50,
|
| 21730 |
"max_steps": 140000,
|
| 21731 |
+
"num_input_tokens_seen": 63952872768,
|
| 21732 |
"num_train_epochs": 2,
|
| 21733 |
"save_steps": 1000,
|
| 21734 |
"stateful_callbacks": {
|
|
|
|
| 21743 |
"attributes": {}
|
| 21744 |
}
|
| 21745 |
},
|
| 21746 |
+
"total_flos": 1.1318491979536712e+20,
|
| 21747 |
"train_batch_size": 32,
|
| 21748 |
"trial_name": null,
|
| 21749 |
"trial_params": null
|