Training in progress, step 14500, checkpoint
Browse files- last-checkpoint/adapter_model.safetensors +1 -1
- last-checkpoint/global_step14500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step14500/mp_rank_00_model_states.pt +3 -0
- last-checkpoint/latest +1 -1
- last-checkpoint/rng_state.pth +1 -1
- last-checkpoint/trainer_state.json +206 -6
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 12017472
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a3576f2655b1ba80d212588b793a4ccc62cae448fb8536ce80c2cb8519f9e8da
|
| 3 |
size 12017472
|
last-checkpoint/global_step14500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fb8ddbbfe6677bfeb3dea29b30df97965b929938a7c03ad9eacba0e52ef12377
|
| 3 |
+
size 71982309
|
last-checkpoint/global_step14500/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3c8de513350d3a396702450256a3434f4f6d8424161c0019906936c1e1f1caa3
|
| 3 |
+
size 146356645
|
last-checkpoint/latest
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
|
|
|
|
| 1 |
+
global_step14500
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14709
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e877aa0d3a3d9a4fe852642f23daa221d76931700b5fdfe8ba4090a8a19bcbbb
|
| 3 |
size 14709
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 0.
|
| 4 |
-
"best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-
|
| 5 |
-
"epoch": 10.
|
| 6 |
"eval_steps": 250,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -5617,6 +5617,206 @@
|
|
| 5617 |
"eval_samples_per_second": 43.639,
|
| 5618 |
"eval_steps_per_second": 5.462,
|
| 5619 |
"step": 14000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5620 |
}
|
| 5621 |
],
|
| 5622 |
"logging_steps": 25,
|
|
@@ -5636,7 +5836,7 @@
|
|
| 5636 |
"attributes": {}
|
| 5637 |
}
|
| 5638 |
},
|
| 5639 |
-
"total_flos":
|
| 5640 |
"train_batch_size": 4,
|
| 5641 |
"trial_name": null,
|
| 5642 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 14500,
|
| 3 |
+
"best_metric": 0.5384897589683533,
|
| 4 |
+
"best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-14500",
|
| 5 |
+
"epoch": 10.538083984730049,
|
| 6 |
"eval_steps": 250,
|
| 7 |
+
"global_step": 14500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 5617 |
"eval_samples_per_second": 43.639,
|
| 5618 |
"eval_steps_per_second": 5.462,
|
| 5619 |
"step": 14000
|
| 5620 |
+
},
|
| 5621 |
+
{
|
| 5622 |
+
"epoch": 10.19269223777495,
|
| 5623 |
+
"grad_norm": 0.8508243560791016,
|
| 5624 |
+
"learning_rate": 4.4477516452882655e-06,
|
| 5625 |
+
"loss": 0.5064,
|
| 5626 |
+
"mean_token_accuracy": 0.8425439709424972,
|
| 5627 |
+
"num_tokens": 308806902.0,
|
| 5628 |
+
"step": 14025
|
| 5629 |
+
},
|
| 5630 |
+
{
|
| 5631 |
+
"epoch": 10.210870750772587,
|
| 5632 |
+
"grad_norm": 0.8245001435279846,
|
| 5633 |
+
"learning_rate": 4.360434283160126e-06,
|
| 5634 |
+
"loss": 0.5089,
|
| 5635 |
+
"mean_token_accuracy": 0.8431083789467811,
|
| 5636 |
+
"num_tokens": 309352182.0,
|
| 5637 |
+
"step": 14050
|
| 5638 |
+
},
|
| 5639 |
+
{
|
| 5640 |
+
"epoch": 10.229049263770223,
|
| 5641 |
+
"grad_norm": 0.8090792298316956,
|
| 5642 |
+
"learning_rate": 4.273933105490162e-06,
|
| 5643 |
+
"loss": 0.5123,
|
| 5644 |
+
"mean_token_accuracy": 0.8400397875905037,
|
| 5645 |
+
"num_tokens": 309919307.0,
|
| 5646 |
+
"step": 14075
|
| 5647 |
+
},
|
| 5648 |
+
{
|
| 5649 |
+
"epoch": 10.247227776767861,
|
| 5650 |
+
"grad_norm": 0.9191139936447144,
|
| 5651 |
+
"learning_rate": 4.188250093248547e-06,
|
| 5652 |
+
"loss": 0.5021,
|
| 5653 |
+
"mean_token_accuracy": 0.8438076037168503,
|
| 5654 |
+
"num_tokens": 310468181.0,
|
| 5655 |
+
"step": 14100
|
| 5656 |
+
},
|
| 5657 |
+
{
|
| 5658 |
+
"epoch": 10.265406289765497,
|
| 5659 |
+
"grad_norm": 0.8430826663970947,
|
| 5660 |
+
"learning_rate": 4.103387208668594e-06,
|
| 5661 |
+
"loss": 0.5103,
|
| 5662 |
+
"mean_token_accuracy": 0.8410224625468254,
|
| 5663 |
+
"num_tokens": 311012563.0,
|
| 5664 |
+
"step": 14125
|
| 5665 |
+
},
|
| 5666 |
+
{
|
| 5667 |
+
"epoch": 10.283584802763134,
|
| 5668 |
+
"grad_norm": 0.8337134122848511,
|
| 5669 |
+
"learning_rate": 4.019346395201793e-06,
|
| 5670 |
+
"loss": 0.5059,
|
| 5671 |
+
"mean_token_accuracy": 0.8416058418154716,
|
| 5672 |
+
"num_tokens": 311558333.0,
|
| 5673 |
+
"step": 14150
|
| 5674 |
+
},
|
| 5675 |
+
{
|
| 5676 |
+
"epoch": 10.301763315760772,
|
| 5677 |
+
"grad_norm": 0.8520947694778442,
|
| 5678 |
+
"learning_rate": 3.936129577473344e-06,
|
| 5679 |
+
"loss": 0.5117,
|
| 5680 |
+
"mean_token_accuracy": 0.839869918525219,
|
| 5681 |
+
"num_tokens": 312128294.0,
|
| 5682 |
+
"step": 14175
|
| 5683 |
+
},
|
| 5684 |
+
{
|
| 5685 |
+
"epoch": 10.319941828758408,
|
| 5686 |
+
"grad_norm": 0.8563548922538757,
|
| 5687 |
+
"learning_rate": 3.853738661238042e-06,
|
| 5688 |
+
"loss": 0.5162,
|
| 5689 |
+
"mean_token_accuracy": 0.8394653937220573,
|
| 5690 |
+
"num_tokens": 312689462.0,
|
| 5691 |
+
"step": 14200
|
| 5692 |
+
},
|
| 5693 |
+
{
|
| 5694 |
+
"epoch": 10.338120341756044,
|
| 5695 |
+
"grad_norm": 0.8299646377563477,
|
| 5696 |
+
"learning_rate": 3.7721755333366326e-06,
|
| 5697 |
+
"loss": 0.508,
|
| 5698 |
+
"mean_token_accuracy": 0.8402037498354912,
|
| 5699 |
+
"num_tokens": 313254544.0,
|
| 5700 |
+
"step": 14225
|
| 5701 |
+
},
|
| 5702 |
+
{
|
| 5703 |
+
"epoch": 10.356298854753682,
|
| 5704 |
+
"grad_norm": 0.865742027759552,
|
| 5705 |
+
"learning_rate": 3.691442061652657e-06,
|
| 5706 |
+
"loss": 0.5106,
|
| 5707 |
+
"mean_token_accuracy": 0.8408624231815338,
|
| 5708 |
+
"num_tokens": 313792753.0,
|
| 5709 |
+
"step": 14250
|
| 5710 |
+
},
|
| 5711 |
+
{
|
| 5712 |
+
"epoch": 10.356298854753682,
|
| 5713 |
+
"eval_loss": 0.5386558175086975,
|
| 5714 |
+
"eval_mean_token_accuracy": 0.8323602951040455,
|
| 5715 |
+
"eval_num_tokens": 313792753.0,
|
| 5716 |
+
"eval_runtime": 111.6679,
|
| 5717 |
+
"eval_samples_per_second": 43.791,
|
| 5718 |
+
"eval_steps_per_second": 5.481,
|
| 5719 |
+
"step": 14250
|
| 5720 |
+
},
|
| 5721 |
+
{
|
| 5722 |
+
"epoch": 10.374477367751318,
|
| 5723 |
+
"grad_norm": 0.9042721390724182,
|
| 5724 |
+
"learning_rate": 3.611540095069592e-06,
|
| 5725 |
+
"loss": 0.5121,
|
| 5726 |
+
"mean_token_accuracy": 0.8402319389581681,
|
| 5727 |
+
"num_tokens": 314338619.0,
|
| 5728 |
+
"step": 14275
|
| 5729 |
+
},
|
| 5730 |
+
{
|
| 5731 |
+
"epoch": 10.392655880748954,
|
| 5732 |
+
"grad_norm": 0.9073200225830078,
|
| 5733 |
+
"learning_rate": 3.5324714634285796e-06,
|
| 5734 |
+
"loss": 0.5095,
|
| 5735 |
+
"mean_token_accuracy": 0.8411319550871849,
|
| 5736 |
+
"num_tokens": 314874371.0,
|
| 5737 |
+
"step": 14300
|
| 5738 |
+
},
|
| 5739 |
+
{
|
| 5740 |
+
"epoch": 10.410834393746592,
|
| 5741 |
+
"grad_norm": 0.8187711238861084,
|
| 5742 |
+
"learning_rate": 3.454237977486483e-06,
|
| 5743 |
+
"loss": 0.5051,
|
| 5744 |
+
"mean_token_accuracy": 0.8423356208205223,
|
| 5745 |
+
"num_tokens": 315434419.0,
|
| 5746 |
+
"step": 14325
|
| 5747 |
+
},
|
| 5748 |
+
{
|
| 5749 |
+
"epoch": 10.429012906744228,
|
| 5750 |
+
"grad_norm": 0.8220618963241577,
|
| 5751 |
+
"learning_rate": 3.3768414288744268e-06,
|
| 5752 |
+
"loss": 0.5118,
|
| 5753 |
+
"mean_token_accuracy": 0.8405367460846901,
|
| 5754 |
+
"num_tokens": 315967309.0,
|
| 5755 |
+
"step": 14350
|
| 5756 |
+
},
|
| 5757 |
+
{
|
| 5758 |
+
"epoch": 10.447191419741864,
|
| 5759 |
+
"grad_norm": 0.9530115723609924,
|
| 5760 |
+
"learning_rate": 3.3002835900567677e-06,
|
| 5761 |
+
"loss": 0.5121,
|
| 5762 |
+
"mean_token_accuracy": 0.8401629340648651,
|
| 5763 |
+
"num_tokens": 316508469.0,
|
| 5764 |
+
"step": 14375
|
| 5765 |
+
},
|
| 5766 |
+
{
|
| 5767 |
+
"epoch": 10.465369932739502,
|
| 5768 |
+
"grad_norm": 0.8760950565338135,
|
| 5769 |
+
"learning_rate": 3.224566214290521e-06,
|
| 5770 |
+
"loss": 0.5057,
|
| 5771 |
+
"mean_token_accuracy": 0.8424499598145485,
|
| 5772 |
+
"num_tokens": 317046765.0,
|
| 5773 |
+
"step": 14400
|
| 5774 |
+
},
|
| 5775 |
+
{
|
| 5776 |
+
"epoch": 10.483548445737139,
|
| 5777 |
+
"grad_norm": 0.8828684091567993,
|
| 5778 |
+
"learning_rate": 3.1496910355851785e-06,
|
| 5779 |
+
"loss": 0.509,
|
| 5780 |
+
"mean_token_accuracy": 0.841154874265194,
|
| 5781 |
+
"num_tokens": 317596305.0,
|
| 5782 |
+
"step": 14425
|
| 5783 |
+
},
|
| 5784 |
+
{
|
| 5785 |
+
"epoch": 10.501726958734775,
|
| 5786 |
+
"grad_norm": 0.7962938547134399,
|
| 5787 |
+
"learning_rate": 3.0756597686630064e-06,
|
| 5788 |
+
"loss": 0.5171,
|
| 5789 |
+
"mean_token_accuracy": 0.8385607668757439,
|
| 5790 |
+
"num_tokens": 318163982.0,
|
| 5791 |
+
"step": 14450
|
| 5792 |
+
},
|
| 5793 |
+
{
|
| 5794 |
+
"epoch": 10.519905471732413,
|
| 5795 |
+
"grad_norm": 0.83053058385849,
|
| 5796 |
+
"learning_rate": 3.0024741089197975e-06,
|
| 5797 |
+
"loss": 0.508,
|
| 5798 |
+
"mean_token_accuracy": 0.8415687373280525,
|
| 5799 |
+
"num_tokens": 318707187.0,
|
| 5800 |
+
"step": 14475
|
| 5801 |
+
},
|
| 5802 |
+
{
|
| 5803 |
+
"epoch": 10.538083984730049,
|
| 5804 |
+
"grad_norm": 0.8857102394104004,
|
| 5805 |
+
"learning_rate": 2.9301357323860168e-06,
|
| 5806 |
+
"loss": 0.5138,
|
| 5807 |
+
"mean_token_accuracy": 0.839360601902008,
|
| 5808 |
+
"num_tokens": 319249758.0,
|
| 5809 |
+
"step": 14500
|
| 5810 |
+
},
|
| 5811 |
+
{
|
| 5812 |
+
"epoch": 10.538083984730049,
|
| 5813 |
+
"eval_loss": 0.5384897589683533,
|
| 5814 |
+
"eval_mean_token_accuracy": 0.8324334327301948,
|
| 5815 |
+
"eval_num_tokens": 319249758.0,
|
| 5816 |
+
"eval_runtime": 110.9365,
|
| 5817 |
+
"eval_samples_per_second": 44.079,
|
| 5818 |
+
"eval_steps_per_second": 5.517,
|
| 5819 |
+
"step": 14500
|
| 5820 |
}
|
| 5821 |
],
|
| 5822 |
"logging_steps": 25,
|
|
|
|
| 5836 |
"attributes": {}
|
| 5837 |
}
|
| 5838 |
},
|
| 5839 |
+
"total_flos": 8.05384191213568e+17,
|
| 5840 |
"train_batch_size": 4,
|
| 5841 |
"trial_name": null,
|
| 5842 |
"trial_params": null
|