aghatage commited on
Commit
e4f636e
·
verified ·
1 Parent(s): 7e76dff

Training in progress, step 12000, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d739a46be07afc08058bcee6abb1772a84e044deaf39817666f3049bcf653c23
3
  size 12017472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d553c2e399ec3b829abae04cec3b1f8d3912f3fbed3f20293d8080f079fd2384
3
  size 12017472
last-checkpoint/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ca0684aa48abff7305a3993c60350cc16307d590903d35bab2763d93196f7ec
3
+ size 71982309
last-checkpoint/global_step12000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:458a0f2126d2e651f4b6c3bae0c3c265ebcf9adcd4b370723bb56b05fd8a7734
3
+ size 146356645
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step11500
 
1
+ global_step12000
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d17e6956d333adf450e550fb2bbfe82bc47be67acb5350845a13faa81c890b40
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfa8ca0fa51002cbd049096e980f791791ccf3dd111ae5e6ddeefe7a21364f2d
3
  size 14709
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 11500,
3
- "best_metric": 0.544745683670044,
4
- "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-11500",
5
- "epoch": 8.357753135793493,
6
  "eval_steps": 250,
7
- "global_step": 11500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4617,6 +4617,206 @@
4617
  "eval_samples_per_second": 42.862,
4618
  "eval_steps_per_second": 5.364,
4619
  "step": 11500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4620
  }
4621
  ],
4622
  "logging_steps": 25,
@@ -4636,7 +4836,7 @@
4636
  "attributes": {}
4637
  }
4638
  },
4639
- "total_flos": 6.387220694035333e+17,
4640
  "train_batch_size": 4,
4641
  "trial_name": null,
4642
  "trial_params": null
 
1
  {
2
+ "best_global_step": 12000,
3
+ "best_metric": 0.5422044396400452,
4
+ "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-12000",
5
+ "epoch": 8.721323395746229,
6
  "eval_steps": 250,
7
+ "global_step": 12000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4617
  "eval_samples_per_second": 42.862,
4618
  "eval_steps_per_second": 5.364,
4619
  "step": 11500
4620
+ },
4621
+ {
4622
+ "epoch": 8.375931648791129,
4623
+ "grad_norm": 0.799972414970398,
4624
+ "learning_rate": 1.6883042740215607e-05,
4625
+ "loss": 0.5202,
4626
+ "mean_token_accuracy": 0.8381973811984063,
4627
+ "num_tokens": 253842951.0,
4628
+ "step": 11525
4629
+ },
4630
+ {
4631
+ "epoch": 8.394110161788765,
4632
+ "grad_norm": 0.8211126327514648,
4633
+ "learning_rate": 1.6727091589575087e-05,
4634
+ "loss": 0.5242,
4635
+ "mean_token_accuracy": 0.8364318865537643,
4636
+ "num_tokens": 254394355.0,
4637
+ "step": 11550
4638
+ },
4639
+ {
4640
+ "epoch": 8.412288674786403,
4641
+ "grad_norm": 0.8928632140159607,
4642
+ "learning_rate": 1.6571673413614786e-05,
4643
+ "loss": 0.5203,
4644
+ "mean_token_accuracy": 0.8383208379149437,
4645
+ "num_tokens": 254928725.0,
4646
+ "step": 11575
4647
+ },
4648
+ {
4649
+ "epoch": 8.43046718778404,
4650
+ "grad_norm": 0.8527329564094543,
4651
+ "learning_rate": 1.641679177157841e-05,
4652
+ "loss": 0.5275,
4653
+ "mean_token_accuracy": 0.8369225415587426,
4654
+ "num_tokens": 255482197.0,
4655
+ "step": 11600
4656
+ },
4657
+ {
4658
+ "epoch": 8.448645700781675,
4659
+ "grad_norm": 0.8391927480697632,
4660
+ "learning_rate": 1.626245021042244e-05,
4661
+ "loss": 0.5318,
4662
+ "mean_token_accuracy": 0.8345513901114464,
4663
+ "num_tokens": 256040695.0,
4664
+ "step": 11625
4665
+ },
4666
+ {
4667
+ "epoch": 8.466824213779313,
4668
+ "grad_norm": 0.816584587097168,
4669
+ "learning_rate": 1.6108652264734953e-05,
4670
+ "loss": 0.5231,
4671
+ "mean_token_accuracy": 0.8369111514091492,
4672
+ "num_tokens": 256592914.0,
4673
+ "step": 11650
4674
+ },
4675
+ {
4676
+ "epoch": 8.48500272677695,
4677
+ "grad_norm": 0.8552756309509277,
4678
+ "learning_rate": 1.5955401456654614e-05,
4679
+ "loss": 0.5156,
4680
+ "mean_token_accuracy": 0.8394366270303726,
4681
+ "num_tokens": 257129982.0,
4682
+ "step": 11675
4683
+ },
4684
+ {
4685
+ "epoch": 8.503181239774586,
4686
+ "grad_norm": 0.9166038632392883,
4687
+ "learning_rate": 1.5802701295790058e-05,
4688
+ "loss": 0.5289,
4689
+ "mean_token_accuracy": 0.8356350338459015,
4690
+ "num_tokens": 257674901.0,
4691
+ "step": 11700
4692
+ },
4693
+ {
4694
+ "epoch": 8.521359752772224,
4695
+ "grad_norm": 0.8325772285461426,
4696
+ "learning_rate": 1.565055527913954e-05,
4697
+ "loss": 0.5295,
4698
+ "mean_token_accuracy": 0.8354543587565422,
4699
+ "num_tokens": 258229868.0,
4700
+ "step": 11725
4701
+ },
4702
+ {
4703
+ "epoch": 8.53953826576986,
4704
+ "grad_norm": 0.8623970150947571,
4705
+ "learning_rate": 1.5498966891010768e-05,
4706
+ "loss": 0.5158,
4707
+ "mean_token_accuracy": 0.8386522510647774,
4708
+ "num_tokens": 258782034.0,
4709
+ "step": 11750
4710
+ },
4711
+ {
4712
+ "epoch": 8.53953826576986,
4713
+ "eval_loss": 0.5427566170692444,
4714
+ "eval_mean_token_accuracy": 0.8309165983418234,
4715
+ "eval_num_tokens": 258782034.0,
4716
+ "eval_runtime": 114.0537,
4717
+ "eval_samples_per_second": 42.875,
4718
+ "eval_steps_per_second": 5.366,
4719
+ "step": 11750
4720
+ },
4721
+ {
4722
+ "epoch": 8.557716778767496,
4723
+ "grad_norm": 0.7980916500091553,
4724
+ "learning_rate": 1.5347939602941168e-05,
4725
+ "loss": 0.5201,
4726
+ "mean_token_accuracy": 0.8383438029885292,
4727
+ "num_tokens": 259333447.0,
4728
+ "step": 11775
4729
+ },
4730
+ {
4731
+ "epoch": 8.575895291765134,
4732
+ "grad_norm": 0.8518940806388855,
4733
+ "learning_rate": 1.5197476873618385e-05,
4734
+ "loss": 0.5208,
4735
+ "mean_token_accuracy": 0.8381571528315545,
4736
+ "num_tokens": 259880015.0,
4737
+ "step": 11800
4738
+ },
4739
+ {
4740
+ "epoch": 8.59407380476277,
4741
+ "grad_norm": 0.8770802617073059,
4742
+ "learning_rate": 1.5047582148801043e-05,
4743
+ "loss": 0.5159,
4744
+ "mean_token_accuracy": 0.8389484801888466,
4745
+ "num_tokens": 260448319.0,
4746
+ "step": 11825
4747
+ },
4748
+ {
4749
+ "epoch": 8.612252317760408,
4750
+ "grad_norm": 0.8422465324401855,
4751
+ "learning_rate": 1.489825886123987e-05,
4752
+ "loss": 0.5149,
4753
+ "mean_token_accuracy": 0.8405806881189346,
4754
+ "num_tokens": 260995823.0,
4755
+ "step": 11850
4756
+ },
4757
+ {
4758
+ "epoch": 8.630430830758044,
4759
+ "grad_norm": 0.8910753726959229,
4760
+ "learning_rate": 1.4749510430599028e-05,
4761
+ "loss": 0.5178,
4762
+ "mean_token_accuracy": 0.838139820098877,
4763
+ "num_tokens": 261537207.0,
4764
+ "step": 11875
4765
+ },
4766
+ {
4767
+ "epoch": 8.64860934375568,
4768
+ "grad_norm": 0.7846816182136536,
4769
+ "learning_rate": 1.460134026337789e-05,
4770
+ "loss": 0.518,
4771
+ "mean_token_accuracy": 0.8382885718345642,
4772
+ "num_tokens": 262095255.0,
4773
+ "step": 11900
4774
+ },
4775
+ {
4776
+ "epoch": 8.666787856753318,
4777
+ "grad_norm": 0.8034218549728394,
4778
+ "learning_rate": 1.445375175283294e-05,
4779
+ "loss": 0.5263,
4780
+ "mean_token_accuracy": 0.8366366970539093,
4781
+ "num_tokens": 262638805.0,
4782
+ "step": 11925
4783
+ },
4784
+ {
4785
+ "epoch": 8.684966369750954,
4786
+ "grad_norm": 0.8177446126937866,
4787
+ "learning_rate": 1.4306748278900102e-05,
4788
+ "loss": 0.5208,
4789
+ "mean_token_accuracy": 0.8381500113010406,
4790
+ "num_tokens": 263176118.0,
4791
+ "step": 11950
4792
+ },
4793
+ {
4794
+ "epoch": 8.70314488274859,
4795
+ "grad_norm": 0.890275239944458,
4796
+ "learning_rate": 1.4160333208117326e-05,
4797
+ "loss": 0.5181,
4798
+ "mean_token_accuracy": 0.8382448080182076,
4799
+ "num_tokens": 263729198.0,
4800
+ "step": 11975
4801
+ },
4802
+ {
4803
+ "epoch": 8.721323395746229,
4804
+ "grad_norm": 0.850255012512207,
4805
+ "learning_rate": 1.4014509893547503e-05,
4806
+ "loss": 0.5279,
4807
+ "mean_token_accuracy": 0.835902649462223,
4808
+ "num_tokens": 264289934.0,
4809
+ "step": 12000
4810
+ },
4811
+ {
4812
+ "epoch": 8.721323395746229,
4813
+ "eval_loss": 0.5422044396400452,
4814
+ "eval_mean_token_accuracy": 0.8310892993912977,
4815
+ "eval_num_tokens": 264289934.0,
4816
+ "eval_runtime": 113.7092,
4817
+ "eval_samples_per_second": 43.004,
4818
+ "eval_steps_per_second": 5.382,
4819
+ "step": 12000
4820
  }
4821
  ],
4822
  "logging_steps": 25,
 
4836
  "attributes": {}
4837
  }
4838
  },
4839
+ "total_flos": 6.664963569651548e+17,
4840
  "train_batch_size": 4,
4841
  "trial_name": null,
4842
  "trial_params": null