Training in progress, step 60000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c41967e5432db5ed91bc1228a51744d8af764a94e341f801caf2cc8d0b340946
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:70cad043527913fd0557530d296a1fe5bc45ca60997f5c855298840644081537
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e6a4cb233f004dcf5c1bd7310c625e6acfeb53e49f5aa9a513759dc7631fff0b
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9201fef1295387122e53aeeb3fe425d2797e674a7be3dba9faefda446e2071fd
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -10510,11 +10510,189 @@
|
|
| 10510 |
"eval_steps_per_second": 23.346,
|
| 10511 |
"num_input_tokens_seen": 15466496000,
|
| 10512 |
"step": 59000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10513 |
}
|
| 10514 |
],
|
| 10515 |
"logging_steps": 50,
|
| 10516 |
"max_steps": 60000,
|
| 10517 |
-
"num_input_tokens_seen":
|
| 10518 |
"num_train_epochs": 1,
|
| 10519 |
"save_steps": 1000,
|
| 10520 |
"stateful_callbacks": {
|
|
@@ -10524,12 +10702,12 @@
|
|
| 10524 |
"should_evaluate": false,
|
| 10525 |
"should_log": false,
|
| 10526 |
"should_save": true,
|
| 10527 |
-
"should_training_stop":
|
| 10528 |
},
|
| 10529 |
"attributes": {}
|
| 10530 |
}
|
| 10531 |
},
|
| 10532 |
-
"total_flos": 4.
|
| 10533 |
"train_batch_size": 64,
|
| 10534 |
"trial_name": null,
|
| 10535 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.40358993244913505,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 60000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 10510 |
"eval_steps_per_second": 23.346,
|
| 10511 |
"num_input_tokens_seen": 15466496000,
|
| 10512 |
"step": 59000
|
| 10513 |
+
},
|
| 10514 |
+
{
|
| 10515 |
+
"epoch": 0.39719975851869044,
|
| 10516 |
+
"grad_norm": 0.14572475850582123,
|
| 10517 |
+
"learning_rate": 6.059144366901737e-05,
|
| 10518 |
+
"loss": 2.9861,
|
| 10519 |
+
"num_input_tokens_seen": 15479603200,
|
| 10520 |
+
"step": 59050
|
| 10521 |
+
},
|
| 10522 |
+
{
|
| 10523 |
+
"epoch": 0.39753608346239805,
|
| 10524 |
+
"grad_norm": 0.5027282238006592,
|
| 10525 |
+
"learning_rate": 5.449673790581611e-05,
|
| 10526 |
+
"loss": 2.9773,
|
| 10527 |
+
"num_input_tokens_seen": 15492710400,
|
| 10528 |
+
"step": 59100
|
| 10529 |
+
},
|
| 10530 |
+
{
|
| 10531 |
+
"epoch": 0.39787240840610566,
|
| 10532 |
+
"grad_norm": 0.192597895860672,
|
| 10533 |
+
"learning_rate": 4.87073578250698e-05,
|
| 10534 |
+
"loss": 2.9874,
|
| 10535 |
+
"num_input_tokens_seen": 15505817600,
|
| 10536 |
+
"step": 59150
|
| 10537 |
+
},
|
| 10538 |
+
{
|
| 10539 |
+
"epoch": 0.39820873334981327,
|
| 10540 |
+
"grad_norm": 0.15083667635917664,
|
| 10541 |
+
"learning_rate": 4.322727117869951e-05,
|
| 10542 |
+
"loss": 2.987,
|
| 10543 |
+
"num_input_tokens_seen": 15518924800,
|
| 10544 |
+
"step": 59200
|
| 10545 |
+
},
|
| 10546 |
+
{
|
| 10547 |
+
"epoch": 0.3985450582935209,
|
| 10548 |
+
"grad_norm": 0.14701534807682037,
|
| 10549 |
+
"learning_rate": 3.806023374435663e-05,
|
| 10550 |
+
"loss": 2.9858,
|
| 10551 |
+
"num_input_tokens_seen": 15532032000,
|
| 10552 |
+
"step": 59250
|
| 10553 |
+
},
|
| 10554 |
+
{
|
| 10555 |
+
"epoch": 0.3988813832372285,
|
| 10556 |
+
"grad_norm": 0.145115464925766,
|
| 10557 |
+
"learning_rate": 3.3209786751399184e-05,
|
| 10558 |
+
"loss": 2.9926,
|
| 10559 |
+
"num_input_tokens_seen": 15545139200,
|
| 10560 |
+
"step": 59300
|
| 10561 |
+
},
|
| 10562 |
+
{
|
| 10563 |
+
"epoch": 0.3992177081809361,
|
| 10564 |
+
"grad_norm": 0.15828457474708557,
|
| 10565 |
+
"learning_rate": 2.8679254453910786e-05,
|
| 10566 |
+
"loss": 2.9803,
|
| 10567 |
+
"num_input_tokens_seen": 15558246400,
|
| 10568 |
+
"step": 59350
|
| 10569 |
+
},
|
| 10570 |
+
{
|
| 10571 |
+
"epoch": 0.3995540331246437,
|
| 10572 |
+
"grad_norm": 0.14400678873062134,
|
| 10573 |
+
"learning_rate": 2.4471741852423235e-05,
|
| 10574 |
+
"loss": 2.9701,
|
| 10575 |
+
"num_input_tokens_seen": 15571353600,
|
| 10576 |
+
"step": 59400
|
| 10577 |
+
},
|
| 10578 |
+
{
|
| 10579 |
+
"epoch": 0.3998903580683513,
|
| 10580 |
+
"grad_norm": 0.14925344288349152,
|
| 10581 |
+
"learning_rate": 2.0590132565903473e-05,
|
| 10582 |
+
"loss": 2.989,
|
| 10583 |
+
"num_input_tokens_seen": 15584460800,
|
| 10584 |
+
"step": 59450
|
| 10585 |
+
},
|
| 10586 |
+
{
|
| 10587 |
+
"epoch": 0.40022668301205894,
|
| 10588 |
+
"grad_norm": 0.14081260561943054,
|
| 10589 |
+
"learning_rate": 1.70370868554659e-05,
|
| 10590 |
+
"loss": 2.9824,
|
| 10591 |
+
"num_input_tokens_seen": 15597568000,
|
| 10592 |
+
"step": 59500
|
| 10593 |
+
},
|
| 10594 |
+
{
|
| 10595 |
+
"epoch": 0.40022668301205894,
|
| 10596 |
+
"eval_loss": 2.882228136062622,
|
| 10597 |
+
"eval_runtime": 53.7595,
|
| 10598 |
+
"eval_samples_per_second": 93.007,
|
| 10599 |
+
"eval_steps_per_second": 23.252,
|
| 10600 |
+
"num_input_tokens_seen": 15597568000,
|
| 10601 |
+
"step": 59500
|
| 10602 |
+
},
|
| 10603 |
+
{
|
| 10604 |
+
"epoch": 0.40056300795576655,
|
| 10605 |
+
"grad_norm": 0.13585136830806732,
|
| 10606 |
+
"learning_rate": 1.3815039801161721e-05,
|
| 10607 |
+
"loss": 2.9883,
|
| 10608 |
+
"num_input_tokens_seen": 15610675200,
|
| 10609 |
+
"step": 59550
|
| 10610 |
+
},
|
| 10611 |
+
{
|
| 10612 |
+
"epoch": 0.40089933289947416,
|
| 10613 |
+
"grad_norm": 0.1438748985528946,
|
| 10614 |
+
"learning_rate": 1.0926199633097156e-05,
|
| 10615 |
+
"loss": 2.9781,
|
| 10616 |
+
"num_input_tokens_seen": 15623782400,
|
| 10617 |
+
"step": 59600
|
| 10618 |
+
},
|
| 10619 |
+
{
|
| 10620 |
+
"epoch": 0.40123565784318177,
|
| 10621 |
+
"grad_norm": 0.3345394730567932,
|
| 10622 |
+
"learning_rate": 8.372546218022748e-06,
|
| 10623 |
+
"loss": 2.9869,
|
| 10624 |
+
"num_input_tokens_seen": 15636889600,
|
| 10625 |
+
"step": 59650
|
| 10626 |
+
},
|
| 10627 |
+
{
|
| 10628 |
+
"epoch": 0.4015719827868894,
|
| 10629 |
+
"grad_norm": 0.14581316709518433,
|
| 10630 |
+
"learning_rate": 6.15582970243117e-06,
|
| 10631 |
+
"loss": 2.9882,
|
| 10632 |
+
"num_input_tokens_seen": 15649996800,
|
| 10633 |
+
"step": 59700
|
| 10634 |
+
},
|
| 10635 |
+
{
|
| 10636 |
+
"epoch": 0.401908307730597,
|
| 10637 |
+
"grad_norm": 0.1409323662519455,
|
| 10638 |
+
"learning_rate": 4.277569313094809e-06,
|
| 10639 |
+
"loss": 2.9833,
|
| 10640 |
+
"num_input_tokens_seen": 15663104000,
|
| 10641 |
+
"step": 59750
|
| 10642 |
+
},
|
| 10643 |
+
{
|
| 10644 |
+
"epoch": 0.4022446326743046,
|
| 10645 |
+
"grad_norm": 0.1412041187286377,
|
| 10646 |
+
"learning_rate": 2.739052315863355e-06,
|
| 10647 |
+
"loss": 2.9835,
|
| 10648 |
+
"num_input_tokens_seen": 15676211200,
|
| 10649 |
+
"step": 59800
|
| 10650 |
+
},
|
| 10651 |
+
{
|
| 10652 |
+
"epoch": 0.4025809576180122,
|
| 10653 |
+
"grad_norm": 0.14011850953102112,
|
| 10654 |
+
"learning_rate": 1.541333133436018e-06,
|
| 10655 |
+
"loss": 2.9819,
|
| 10656 |
+
"num_input_tokens_seen": 15689318400,
|
| 10657 |
+
"step": 59850
|
| 10658 |
+
},
|
| 10659 |
+
{
|
| 10660 |
+
"epoch": 0.4029172825617198,
|
| 10661 |
+
"grad_norm": 0.14772015810012817,
|
| 10662 |
+
"learning_rate": 6.852326227130834e-07,
|
| 10663 |
+
"loss": 2.9855,
|
| 10664 |
+
"num_input_tokens_seen": 15702425600,
|
| 10665 |
+
"step": 59900
|
| 10666 |
+
},
|
| 10667 |
+
{
|
| 10668 |
+
"epoch": 0.40325360750542744,
|
| 10669 |
+
"grad_norm": 0.14281156659126282,
|
| 10670 |
+
"learning_rate": 1.7133751222137007e-07,
|
| 10671 |
+
"loss": 2.978,
|
| 10672 |
+
"num_input_tokens_seen": 15715532800,
|
| 10673 |
+
"step": 59950
|
| 10674 |
+
},
|
| 10675 |
+
{
|
| 10676 |
+
"epoch": 0.40358993244913505,
|
| 10677 |
+
"grad_norm": 0.14420129358768463,
|
| 10678 |
+
"learning_rate": 0.0,
|
| 10679 |
+
"loss": 2.9789,
|
| 10680 |
+
"num_input_tokens_seen": 15728640000,
|
| 10681 |
+
"step": 60000
|
| 10682 |
+
},
|
| 10683 |
+
{
|
| 10684 |
+
"epoch": 0.40358993244913505,
|
| 10685 |
+
"eval_loss": 2.8818726539611816,
|
| 10686 |
+
"eval_runtime": 53.5982,
|
| 10687 |
+
"eval_samples_per_second": 93.287,
|
| 10688 |
+
"eval_steps_per_second": 23.322,
|
| 10689 |
+
"num_input_tokens_seen": 15728640000,
|
| 10690 |
+
"step": 60000
|
| 10691 |
}
|
| 10692 |
],
|
| 10693 |
"logging_steps": 50,
|
| 10694 |
"max_steps": 60000,
|
| 10695 |
+
"num_input_tokens_seen": 15728640000,
|
| 10696 |
"num_train_epochs": 1,
|
| 10697 |
"save_steps": 1000,
|
| 10698 |
"stateful_callbacks": {
|
|
|
|
| 10702 |
"should_evaluate": false,
|
| 10703 |
"should_log": false,
|
| 10704 |
"should_save": true,
|
| 10705 |
+
"should_training_stop": true
|
| 10706 |
},
|
| 10707 |
"attributes": {}
|
| 10708 |
}
|
| 10709 |
},
|
| 10710 |
+
"total_flos": 4.2075647115264e+18,
|
| 10711 |
"train_batch_size": 64,
|
| 10712 |
"trial_name": null,
|
| 10713 |
"trial_params": null
|