aghatage committed on
Commit
96aecff
·
verified ·
1 Parent(s): 8cd6271

Training in progress, step 2000, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e3e9ee8a611e7a57dccff25563a008747ed15810194baa91980ef853c11a0a7
3
  size 12017472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b160330a699e6391aabdd6c326d1ca2154af597460c4109b821f3a27a3de51f
3
  size 12017472
last-checkpoint/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:075920d4fe69625abfe8ade60f18025bd5df07d45e21e94c515e87ef9a80ae16
3
+ size 71982309
last-checkpoint/global_step2000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edecabba5f405ae2044dcff16f3f5c2a1215ca2313484c1f864808f888ecf949
3
+ size 146356645
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step1500
 
1
+ global_step2000
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:27afcafd6ed5692d8873208ba0cf57e46a0701e5eb0aa08cd9750d1e2b88cb5d
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d92ac44cc5eabc6a5deb9b9de409e8c10d46ff0d44b4e3a5b61bcb9e4a0349fe
3
  size 14709
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 1500,
3
- "best_metric": 0.6834071278572083,
4
- "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-1500",
5
- "epoch": 1.0901654244682786,
6
  "eval_steps": 250,
7
- "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -617,6 +617,206 @@
617
  "eval_samples_per_second": 43.157,
618
  "eval_steps_per_second": 5.401,
619
  "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620
  }
621
  ],
622
  "logging_steps": 25,
@@ -636,7 +836,7 @@
636
  "attributes": {}
637
  }
638
  },
639
- "total_flos": 8.33876308084654e+16,
640
  "train_batch_size": 4,
641
  "trial_name": null,
642
  "trial_params": null
 
1
  {
2
+ "best_global_step": 2000,
3
+ "best_metric": 0.6596384644508362,
4
+ "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-2000",
5
+ "epoch": 1.4537356844210143,
6
  "eval_steps": 250,
7
+ "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
617
  "eval_samples_per_second": 43.157,
618
  "eval_steps_per_second": 5.401,
619
  "step": 1500
620
+ },
621
+ {
622
+ "epoch": 1.1083439374659152,
623
+ "grad_norm": 0.988451361656189,
624
+ "learning_rate": 7.852108931102753e-05,
625
+ "loss": 0.6679,
626
+ "mean_token_accuracy": 0.7947028204798698,
627
+ "num_tokens": 33602251.0,
628
+ "step": 1525
629
+ },
630
+ {
631
+ "epoch": 1.126522450463552,
632
+ "grad_norm": 1.0758918523788452,
633
+ "learning_rate": 7.846907896999776e-05,
634
+ "loss": 0.6738,
635
+ "mean_token_accuracy": 0.7931618624925614,
636
+ "num_tokens": 34152752.0,
637
+ "step": 1550
638
+ },
639
+ {
640
+ "epoch": 1.1447009634611889,
641
+ "grad_norm": 0.9699676036834717,
642
+ "learning_rate": 7.841618764560739e-05,
643
+ "loss": 0.6814,
644
+ "mean_token_accuracy": 0.7907909327745437,
645
+ "num_tokens": 34714148.0,
646
+ "step": 1575
647
+ },
648
+ {
649
+ "epoch": 1.1628794764588257,
650
+ "grad_norm": 0.9129726886749268,
651
+ "learning_rate": 7.836241654912474e-05,
652
+ "loss": 0.6799,
653
+ "mean_token_accuracy": 0.7906370875239372,
654
+ "num_tokens": 35275146.0,
655
+ "step": 1600
656
+ },
657
+ {
658
+ "epoch": 1.1810579894564626,
659
+ "grad_norm": 0.9198676347732544,
660
+ "learning_rate": 7.830776691196585e-05,
661
+ "loss": 0.6699,
662
+ "mean_token_accuracy": 0.7948868894577026,
663
+ "num_tokens": 35821013.0,
664
+ "step": 1625
665
+ },
666
+ {
667
+ "epoch": 1.1992365024540992,
668
+ "grad_norm": 0.9604835510253906,
669
+ "learning_rate": 7.825223998566632e-05,
670
+ "loss": 0.6855,
671
+ "mean_token_accuracy": 0.790110493004322,
672
+ "num_tokens": 36367326.0,
673
+ "step": 1650
674
+ },
675
+ {
676
+ "epoch": 1.217415015451736,
677
+ "grad_norm": 0.9292364716529846,
678
+ "learning_rate": 7.819583704185258e-05,
679
+ "loss": 0.6665,
680
+ "mean_token_accuracy": 0.7950288987159729,
681
+ "num_tokens": 36904100.0,
682
+ "step": 1675
683
+ },
684
+ {
685
+ "epoch": 1.2355935284493729,
686
+ "grad_norm": 0.9496335387229919,
687
+ "learning_rate": 7.813855937221283e-05,
688
+ "loss": 0.6793,
689
+ "mean_token_accuracy": 0.7916408607363701,
690
+ "num_tokens": 37451860.0,
691
+ "step": 1700
692
+ },
693
+ {
694
+ "epoch": 1.2537720414470097,
695
+ "grad_norm": 0.9605362415313721,
696
+ "learning_rate": 7.808040828846742e-05,
697
+ "loss": 0.6703,
698
+ "mean_token_accuracy": 0.7932550877332687,
699
+ "num_tokens": 38012329.0,
700
+ "step": 1725
701
+ },
702
+ {
703
+ "epoch": 1.2719505544446466,
704
+ "grad_norm": 0.9731937646865845,
705
+ "learning_rate": 7.80213851223388e-05,
706
+ "loss": 0.6631,
707
+ "mean_token_accuracy": 0.7963846024870872,
708
+ "num_tokens": 38545490.0,
709
+ "step": 1750
710
+ },
711
+ {
712
+ "epoch": 1.2719505544446466,
713
+ "eval_loss": 0.6696051359176636,
714
+ "eval_mean_token_accuracy": 0.792529649204678,
715
+ "eval_num_tokens": 38545490.0,
716
+ "eval_runtime": 114.5795,
717
+ "eval_samples_per_second": 42.678,
718
+ "eval_steps_per_second": 5.341,
719
+ "step": 1750
720
+ },
721
+ {
722
+ "epoch": 1.2901290674422832,
723
+ "grad_norm": 0.9220979809761047,
724
+ "learning_rate": 7.796149122552112e-05,
725
+ "loss": 0.6663,
726
+ "mean_token_accuracy": 0.7952693116664886,
727
+ "num_tokens": 39090734.0,
728
+ "step": 1775
729
+ },
730
+ {
731
+ "epoch": 1.30830758043992,
732
+ "grad_norm": 0.883160412311554,
733
+ "learning_rate": 7.790072796964914e-05,
734
+ "loss": 0.6645,
735
+ "mean_token_accuracy": 0.796334767639637,
736
+ "num_tokens": 39651191.0,
737
+ "step": 1800
738
+ },
739
+ {
740
+ "epoch": 1.3264860934375569,
741
+ "grad_norm": 0.940244734287262,
742
+ "learning_rate": 7.783909674626689e-05,
743
+ "loss": 0.6696,
744
+ "mean_token_accuracy": 0.794621022939682,
745
+ "num_tokens": 40201262.0,
746
+ "step": 1825
747
+ },
748
+ {
749
+ "epoch": 1.3446646064351935,
750
+ "grad_norm": 0.9481264352798462,
751
+ "learning_rate": 7.77765989667958e-05,
752
+ "loss": 0.6594,
753
+ "mean_token_accuracy": 0.797239051759243,
754
+ "num_tokens": 40729247.0,
755
+ "step": 1850
756
+ },
757
+ {
758
+ "epoch": 1.3628431194328303,
759
+ "grad_norm": 0.8973710536956787,
760
+ "learning_rate": 7.771323606250233e-05,
761
+ "loss": 0.6729,
762
+ "mean_token_accuracy": 0.7936947122216225,
763
+ "num_tokens": 41294203.0,
764
+ "step": 1875
765
+ },
766
+ {
767
+ "epoch": 1.3810216324304672,
768
+ "grad_norm": 0.9314188361167908,
769
+ "learning_rate": 7.764900948446533e-05,
770
+ "loss": 0.6673,
771
+ "mean_token_accuracy": 0.7956089550256729,
772
+ "num_tokens": 41844911.0,
773
+ "step": 1900
774
+ },
775
+ {
776
+ "epoch": 1.399200145428104,
777
+ "grad_norm": 0.9455300569534302,
778
+ "learning_rate": 7.758392070354259e-05,
779
+ "loss": 0.6705,
780
+ "mean_token_accuracy": 0.7935251343250275,
781
+ "num_tokens": 42404008.0,
782
+ "step": 1925
783
+ },
784
+ {
785
+ "epoch": 1.4173786584257408,
786
+ "grad_norm": 0.9419692754745483,
787
+ "learning_rate": 7.751797121033737e-05,
788
+ "loss": 0.6595,
789
+ "mean_token_accuracy": 0.7975886738300324,
790
+ "num_tokens": 42936579.0,
791
+ "step": 1950
792
+ },
793
+ {
794
+ "epoch": 1.4355571714233775,
795
+ "grad_norm": 0.8725437521934509,
796
+ "learning_rate": 7.745116251516407e-05,
797
+ "loss": 0.6603,
798
+ "mean_token_accuracy": 0.795488908290863,
799
+ "num_tokens": 43483670.0,
800
+ "step": 1975
801
+ },
802
+ {
803
+ "epoch": 1.4537356844210143,
804
+ "grad_norm": 0.9226874113082886,
805
+ "learning_rate": 7.738349614801387e-05,
806
+ "loss": 0.6642,
807
+ "mean_token_accuracy": 0.7964420530200005,
808
+ "num_tokens": 44044259.0,
809
+ "step": 2000
810
+ },
811
+ {
812
+ "epoch": 1.4537356844210143,
813
+ "eval_loss": 0.6596384644508362,
814
+ "eval_mean_token_accuracy": 0.7954148624847138,
815
+ "eval_num_tokens": 44044259.0,
816
+ "eval_runtime": 112.7192,
817
+ "eval_samples_per_second": 43.382,
818
+ "eval_steps_per_second": 5.429,
819
+ "step": 2000
820
  }
821
  ],
822
  "logging_steps": 25,
 
836
  "attributes": {}
837
  }
838
  },
839
+ "total_flos": 1.111102656169902e+17,
840
  "train_batch_size": 4,
841
  "trial_name": null,
842
  "trial_params": null