aghatage commited on
Commit
52ac596
·
verified ·
1 Parent(s): 859f035

Training in progress, step 14500, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8aeb85e50392772e4b771ed01067db7dd33a6869f84c04ae7432a0dd055a0f19
3
  size 12017472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3576f2655b1ba80d212588b793a4ccc62cae448fb8536ce80c2cb8519f9e8da
3
  size 12017472
last-checkpoint/global_step14500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb8ddbbfe6677bfeb3dea29b30df97965b929938a7c03ad9eacba0e52ef12377
3
+ size 71982309
last-checkpoint/global_step14500/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c8de513350d3a396702450256a3434f4f6d8424161c0019906936c1e1f1caa3
3
+ size 146356645
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step14000
 
1
+ global_step14500
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f95af1290403efd8633702ca95f724b8eeb1c11b90d76b5a45554aca28009c5
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e877aa0d3a3d9a4fe852642f23daa221d76931700b5fdfe8ba4090a8a19bcbbb
3
  size 14709
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 13750,
3
- "best_metric": 0.5387488603591919,
4
- "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-13500",
5
- "epoch": 10.174513724777313,
6
  "eval_steps": 250,
7
- "global_step": 14000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5617,6 +5617,206 @@
5617
  "eval_samples_per_second": 43.639,
5618
  "eval_steps_per_second": 5.462,
5619
  "step": 14000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5620
  }
5621
  ],
5622
  "logging_steps": 25,
@@ -5636,7 +5836,7 @@
5636
  "attributes": {}
5637
  }
5638
  },
5639
- "total_flos": 7.776595356229304e+17,
5640
  "train_batch_size": 4,
5641
  "trial_name": null,
5642
  "trial_params": null
 
1
  {
2
+ "best_global_step": 14500,
3
+ "best_metric": 0.5384897589683533,
4
+ "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-14500",
5
+ "epoch": 10.538083984730049,
6
  "eval_steps": 250,
7
+ "global_step": 14500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5617
  "eval_samples_per_second": 43.639,
5618
  "eval_steps_per_second": 5.462,
5619
  "step": 14000
5620
+ },
5621
+ {
5622
+ "epoch": 10.19269223777495,
5623
+ "grad_norm": 0.8508243560791016,
5624
+ "learning_rate": 4.4477516452882655e-06,
5625
+ "loss": 0.5064,
5626
+ "mean_token_accuracy": 0.8425439709424972,
5627
+ "num_tokens": 308806902.0,
5628
+ "step": 14025
5629
+ },
5630
+ {
5631
+ "epoch": 10.210870750772587,
5632
+ "grad_norm": 0.8245001435279846,
5633
+ "learning_rate": 4.360434283160126e-06,
5634
+ "loss": 0.5089,
5635
+ "mean_token_accuracy": 0.8431083789467811,
5636
+ "num_tokens": 309352182.0,
5637
+ "step": 14050
5638
+ },
5639
+ {
5640
+ "epoch": 10.229049263770223,
5641
+ "grad_norm": 0.8090792298316956,
5642
+ "learning_rate": 4.273933105490162e-06,
5643
+ "loss": 0.5123,
5644
+ "mean_token_accuracy": 0.8400397875905037,
5645
+ "num_tokens": 309919307.0,
5646
+ "step": 14075
5647
+ },
5648
+ {
5649
+ "epoch": 10.247227776767861,
5650
+ "grad_norm": 0.9191139936447144,
5651
+ "learning_rate": 4.188250093248547e-06,
5652
+ "loss": 0.5021,
5653
+ "mean_token_accuracy": 0.8438076037168503,
5654
+ "num_tokens": 310468181.0,
5655
+ "step": 14100
5656
+ },
5657
+ {
5658
+ "epoch": 10.265406289765497,
5659
+ "grad_norm": 0.8430826663970947,
5660
+ "learning_rate": 4.103387208668594e-06,
5661
+ "loss": 0.5103,
5662
+ "mean_token_accuracy": 0.8410224625468254,
5663
+ "num_tokens": 311012563.0,
5664
+ "step": 14125
5665
+ },
5666
+ {
5667
+ "epoch": 10.283584802763134,
5668
+ "grad_norm": 0.8337134122848511,
5669
+ "learning_rate": 4.019346395201793e-06,
5670
+ "loss": 0.5059,
5671
+ "mean_token_accuracy": 0.8416058418154716,
5672
+ "num_tokens": 311558333.0,
5673
+ "step": 14150
5674
+ },
5675
+ {
5676
+ "epoch": 10.301763315760772,
5677
+ "grad_norm": 0.8520947694778442,
5678
+ "learning_rate": 3.936129577473344e-06,
5679
+ "loss": 0.5117,
5680
+ "mean_token_accuracy": 0.839869918525219,
5681
+ "num_tokens": 312128294.0,
5682
+ "step": 14175
5683
+ },
5684
+ {
5685
+ "epoch": 10.319941828758408,
5686
+ "grad_norm": 0.8563548922538757,
5687
+ "learning_rate": 3.853738661238042e-06,
5688
+ "loss": 0.5162,
5689
+ "mean_token_accuracy": 0.8394653937220573,
5690
+ "num_tokens": 312689462.0,
5691
+ "step": 14200
5692
+ },
5693
+ {
5694
+ "epoch": 10.338120341756044,
5695
+ "grad_norm": 0.8299646377563477,
5696
+ "learning_rate": 3.7721755333366326e-06,
5697
+ "loss": 0.508,
5698
+ "mean_token_accuracy": 0.8402037498354912,
5699
+ "num_tokens": 313254544.0,
5700
+ "step": 14225
5701
+ },
5702
+ {
5703
+ "epoch": 10.356298854753682,
5704
+ "grad_norm": 0.865742027759552,
5705
+ "learning_rate": 3.691442061652657e-06,
5706
+ "loss": 0.5106,
5707
+ "mean_token_accuracy": 0.8408624231815338,
5708
+ "num_tokens": 313792753.0,
5709
+ "step": 14250
5710
+ },
5711
+ {
5712
+ "epoch": 10.356298854753682,
5713
+ "eval_loss": 0.5386558175086975,
5714
+ "eval_mean_token_accuracy": 0.8323602951040455,
5715
+ "eval_num_tokens": 313792753.0,
5716
+ "eval_runtime": 111.6679,
5717
+ "eval_samples_per_second": 43.791,
5718
+ "eval_steps_per_second": 5.481,
5719
+ "step": 14250
5720
+ },
5721
+ {
5722
+ "epoch": 10.374477367751318,
5723
+ "grad_norm": 0.9042721390724182,
5724
+ "learning_rate": 3.611540095069592e-06,
5725
+ "loss": 0.5121,
5726
+ "mean_token_accuracy": 0.8402319389581681,
5727
+ "num_tokens": 314338619.0,
5728
+ "step": 14275
5729
+ },
5730
+ {
5731
+ "epoch": 10.392655880748954,
5732
+ "grad_norm": 0.9073200225830078,
5733
+ "learning_rate": 3.5324714634285796e-06,
5734
+ "loss": 0.5095,
5735
+ "mean_token_accuracy": 0.8411319550871849,
5736
+ "num_tokens": 314874371.0,
5737
+ "step": 14300
5738
+ },
5739
+ {
5740
+ "epoch": 10.410834393746592,
5741
+ "grad_norm": 0.8187711238861084,
5742
+ "learning_rate": 3.454237977486483e-06,
5743
+ "loss": 0.5051,
5744
+ "mean_token_accuracy": 0.8423356208205223,
5745
+ "num_tokens": 315434419.0,
5746
+ "step": 14325
5747
+ },
5748
+ {
5749
+ "epoch": 10.429012906744228,
5750
+ "grad_norm": 0.8220618963241577,
5751
+ "learning_rate": 3.3768414288744268e-06,
5752
+ "loss": 0.5118,
5753
+ "mean_token_accuracy": 0.8405367460846901,
5754
+ "num_tokens": 315967309.0,
5755
+ "step": 14350
5756
+ },
5757
+ {
5758
+ "epoch": 10.447191419741864,
5759
+ "grad_norm": 0.9530115723609924,
5760
+ "learning_rate": 3.3002835900567677e-06,
5761
+ "loss": 0.5121,
5762
+ "mean_token_accuracy": 0.8401629340648651,
5763
+ "num_tokens": 316508469.0,
5764
+ "step": 14375
5765
+ },
5766
+ {
5767
+ "epoch": 10.465369932739502,
5768
+ "grad_norm": 0.8760950565338135,
5769
+ "learning_rate": 3.224566214290521e-06,
5770
+ "loss": 0.5057,
5771
+ "mean_token_accuracy": 0.8424499598145485,
5772
+ "num_tokens": 317046765.0,
5773
+ "step": 14400
5774
+ },
5775
+ {
5776
+ "epoch": 10.483548445737139,
5777
+ "grad_norm": 0.8828684091567993,
5778
+ "learning_rate": 3.1496910355851785e-06,
5779
+ "loss": 0.509,
5780
+ "mean_token_accuracy": 0.841154874265194,
5781
+ "num_tokens": 317596305.0,
5782
+ "step": 14425
5783
+ },
5784
+ {
5785
+ "epoch": 10.501726958734775,
5786
+ "grad_norm": 0.7962938547134399,
5787
+ "learning_rate": 3.0756597686630064e-06,
5788
+ "loss": 0.5171,
5789
+ "mean_token_accuracy": 0.8385607668757439,
5790
+ "num_tokens": 318163982.0,
5791
+ "step": 14450
5792
+ },
5793
+ {
5794
+ "epoch": 10.519905471732413,
5795
+ "grad_norm": 0.83053058385849,
5796
+ "learning_rate": 3.0024741089197975e-06,
5797
+ "loss": 0.508,
5798
+ "mean_token_accuracy": 0.8415687373280525,
5799
+ "num_tokens": 318707187.0,
5800
+ "step": 14475
5801
+ },
5802
+ {
5803
+ "epoch": 10.538083984730049,
5804
+ "grad_norm": 0.8857102394104004,
5805
+ "learning_rate": 2.9301357323860168e-06,
5806
+ "loss": 0.5138,
5807
+ "mean_token_accuracy": 0.839360601902008,
5808
+ "num_tokens": 319249758.0,
5809
+ "step": 14500
5810
+ },
5811
+ {
5812
+ "epoch": 10.538083984730049,
5813
+ "eval_loss": 0.5384897589683533,
5814
+ "eval_mean_token_accuracy": 0.8324334327301948,
5815
+ "eval_num_tokens": 319249758.0,
5816
+ "eval_runtime": 110.9365,
5817
+ "eval_samples_per_second": 44.079,
5818
+ "eval_steps_per_second": 5.517,
5819
+ "step": 14500
5820
  }
5821
  ],
5822
  "logging_steps": 25,
 
5836
  "attributes": {}
5837
  }
5838
  },
5839
+ "total_flos": 8.05384191213568e+17,
5840
  "train_batch_size": 4,
5841
  "trial_name": null,
5842
  "trial_params": null