Azrail commited on
Commit
3a055c3
·
verified ·
1 Parent(s): 8b170fe

Training in progress, step 4000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e63c3db29c1618638ad881936caf0567d6bf237c2d8358081d7324f323992b88
3
  size 150625560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:205a3991d60a5c28fb1c39f7dbf7a515c4fd4b6685d8240efe51c017acfa36b1
3
  size 150625560
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c5d06df021b0676b1a33468e46c419e946ec39b40bd40d3bc089bc612bb03c6
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d50c4bfe654d987803fd6a6960587e83c15bda23187fdd2e49b310d524ae5ac
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:841a17a497b009366c5a93d5dd6be560b5debec016825b3dd528f17c802bd947
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2fab474267dfdb6f9f735fba3b6956eaa8395da984c318144fab7c0aefa914f
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5361224ea55f23c97a6ce6c66443aa545b9c0f839415b7a0921d0d516508616d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1982f4530393fe0c872036d1fa81199d7b6bd002acf4619fae87ec9de696f64d
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.7242331427191937,
6
  "eval_steps": 500,
7
- "global_step": 3000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -662,11 +662,229 @@
662
  "eval_steps_per_second": 21.312,
663
  "num_input_tokens_seen": 1448897616,
664
  "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
665
  }
666
  ],
667
  "logging_steps": 50,
668
  "max_steps": 16568,
669
- "num_input_tokens_seen": 1448897616,
670
  "num_train_epochs": 4,
671
  "save_steps": 1000,
672
  "stateful_callbacks": {
@@ -681,7 +899,7 @@
681
  "attributes": {}
682
  }
683
  },
684
- "total_flos": 3.8759425352073216e+17,
685
  "train_batch_size": 16,
686
  "trial_name": null,
687
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9656441902922582,
6
  "eval_steps": 500,
7
+ "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
662
  "eval_steps_per_second": 21.312,
663
  "num_input_tokens_seen": 1448897616,
664
  "step": 3000
665
+ },
666
+ {
667
+ "epoch": 0.7363036950978469,
668
+ "grad_norm": 0.298828125,
669
+ "learning_rate": 4.6016898008449e-05,
670
+ "loss": 2.2866,
671
+ "mean_token_accuracy": 0.535948946569115,
672
+ "num_input_tokens_seen": 1472853872,
673
+ "num_tokens": 620644723.0,
674
+ "step": 3050
675
+ },
676
+ {
677
+ "epoch": 0.7483742474765002,
678
+ "grad_norm": 0.296875,
679
+ "learning_rate": 4.6771273385636696e-05,
680
+ "loss": 2.2713,
681
+ "mean_token_accuracy": 0.5381996771320701,
682
+ "num_input_tokens_seen": 1497112768,
683
+ "num_tokens": 630831881.0,
684
+ "step": 3100
685
+ },
686
+ {
687
+ "epoch": 0.7604447998551533,
688
+ "grad_norm": 0.318359375,
689
+ "learning_rate": 4.752564876282438e-05,
690
+ "loss": 2.2659,
691
+ "mean_token_accuracy": 0.5379517000168562,
692
+ "num_input_tokens_seen": 1521326656,
693
+ "num_tokens": 640973141.0,
694
+ "step": 3150
695
+ },
696
+ {
697
+ "epoch": 0.7725153522338066,
698
+ "grad_norm": 0.27734375,
699
+ "learning_rate": 4.828002414001207e-05,
700
+ "loss": 2.2534,
701
+ "mean_token_accuracy": 0.5404072028771043,
702
+ "num_input_tokens_seen": 1545505712,
703
+ "num_tokens": 651205945.0,
704
+ "step": 3200
705
+ },
706
+ {
707
+ "epoch": 0.7845859046124598,
708
+ "grad_norm": 0.267578125,
709
+ "learning_rate": 4.903439951719976e-05,
710
+ "loss": 2.2662,
711
+ "mean_token_accuracy": 0.5395008590817452,
712
+ "num_input_tokens_seen": 1569597408,
713
+ "num_tokens": 661422296.0,
714
+ "step": 3250
715
+ },
716
+ {
717
+ "epoch": 0.7966564569911131,
718
+ "grad_norm": 0.306640625,
719
+ "learning_rate": 4.978877489438745e-05,
720
+ "loss": 2.2429,
721
+ "mean_token_accuracy": 0.5423036898300052,
722
+ "num_input_tokens_seen": 1593795440,
723
+ "num_tokens": 671551427.0,
724
+ "step": 3300
725
+ },
726
+ {
727
+ "epoch": 0.8087270093697663,
728
+ "grad_norm": 0.29296875,
729
+ "learning_rate": 4.9864191942055236e-05,
730
+ "loss": 2.234,
731
+ "mean_token_accuracy": 0.5444167210906744,
732
+ "num_input_tokens_seen": 1618048688,
733
+ "num_tokens": 681790219.0,
734
+ "step": 3350
735
+ },
736
+ {
737
+ "epoch": 0.8207975617484196,
738
+ "grad_norm": 0.306640625,
739
+ "learning_rate": 4.967556963935416e-05,
740
+ "loss": 2.2466,
741
+ "mean_token_accuracy": 0.5417763916775584,
742
+ "num_input_tokens_seen": 1642159776,
743
+ "num_tokens": 691978554.0,
744
+ "step": 3400
745
+ },
746
+ {
747
+ "epoch": 0.8328681141270727,
748
+ "grad_norm": 0.30859375,
749
+ "learning_rate": 4.9486947336653086e-05,
750
+ "loss": 2.2358,
751
+ "mean_token_accuracy": 0.5441526301577687,
752
+ "num_input_tokens_seen": 1666410544,
753
+ "num_tokens": 702184762.0,
754
+ "step": 3450
755
+ },
756
+ {
757
+ "epoch": 0.844938666505726,
758
+ "grad_norm": 0.2578125,
759
+ "learning_rate": 4.929832503395201e-05,
760
+ "loss": 2.2274,
761
+ "num_input_tokens_seen": 1690571888,
762
+ "step": 3500
763
+ },
764
+ {
765
+ "epoch": 0.844938666505726,
766
+ "eval_loss": 2.110778331756592,
767
+ "eval_mean_token_accuracy": 0.5686311067219009,
768
+ "eval_num_tokens": 712362296.0,
769
+ "eval_runtime": 127.3714,
770
+ "eval_samples_per_second": 84.1,
771
+ "eval_steps_per_second": 21.025,
772
+ "num_input_tokens_seen": 1690571888,
773
+ "step": 3500
774
+ },
775
+ {
776
+ "epoch": 0.8570092188843792,
777
+ "grad_norm": 0.283203125,
778
+ "learning_rate": 4.9109702731250944e-05,
779
+ "loss": 2.2322,
780
+ "mean_token_accuracy": 0.5444289642199874,
781
+ "num_input_tokens_seen": 1714787776,
782
+ "num_tokens": 722682302.0,
783
+ "step": 3550
784
+ },
785
+ {
786
+ "epoch": 0.8690797712630324,
787
+ "grad_norm": 0.3203125,
788
+ "learning_rate": 4.8921080428549876e-05,
789
+ "loss": 2.2194,
790
+ "mean_token_accuracy": 0.5459688815101981,
791
+ "num_input_tokens_seen": 1739009552,
792
+ "num_tokens": 732823822.0,
793
+ "step": 3600
794
+ },
795
+ {
796
+ "epoch": 0.8811503236416857,
797
+ "grad_norm": 0.267578125,
798
+ "learning_rate": 4.87324581258488e-05,
799
+ "loss": 2.2139,
800
+ "mean_token_accuracy": 0.5467682545632124,
801
+ "num_input_tokens_seen": 1763209840,
802
+ "num_tokens": 743138600.0,
803
+ "step": 3650
804
+ },
805
+ {
806
+ "epoch": 0.8932208760203388,
807
+ "grad_norm": 0.328125,
808
+ "learning_rate": 4.854383582314773e-05,
809
+ "loss": 2.204,
810
+ "mean_token_accuracy": 0.5477216844260693,
811
+ "num_input_tokens_seen": 1787295680,
812
+ "num_tokens": 753284868.0,
813
+ "step": 3700
814
+ },
815
+ {
816
+ "epoch": 0.9052914283989921,
817
+ "grad_norm": 0.306640625,
818
+ "learning_rate": 4.835521352044666e-05,
819
+ "loss": 2.186,
820
+ "mean_token_accuracy": 0.5463542007282376,
821
+ "num_input_tokens_seen": 1811501840,
822
+ "num_tokens": 763533047.0,
823
+ "step": 3750
824
+ },
825
+ {
826
+ "epoch": 0.9173619807776453,
827
+ "grad_norm": 0.294921875,
828
+ "learning_rate": 4.816659121774559e-05,
829
+ "loss": 2.1705,
830
+ "mean_token_accuracy": 0.5472249809652567,
831
+ "num_input_tokens_seen": 1835579680,
832
+ "num_tokens": 773772552.0,
833
+ "step": 3800
834
+ },
835
+ {
836
+ "epoch": 0.9294325331562986,
837
+ "grad_norm": 0.2578125,
838
+ "learning_rate": 4.797796891504452e-05,
839
+ "loss": 2.1472,
840
+ "mean_token_accuracy": 0.5502070318907499,
841
+ "num_input_tokens_seen": 1859762928,
842
+ "num_tokens": 783996309.0,
843
+ "step": 3850
844
+ },
845
+ {
846
+ "epoch": 0.9415030855349518,
847
+ "grad_norm": 0.30078125,
848
+ "learning_rate": 4.778934661234345e-05,
849
+ "loss": 2.1494,
850
+ "mean_token_accuracy": 0.548779489658773,
851
+ "num_input_tokens_seen": 1883948656,
852
+ "num_tokens": 794252324.0,
853
+ "step": 3900
854
+ },
855
+ {
856
+ "epoch": 0.953573637913605,
857
+ "grad_norm": 0.29296875,
858
+ "learning_rate": 4.760072430964237e-05,
859
+ "loss": 2.1484,
860
+ "mean_token_accuracy": 0.5490481401607394,
861
+ "num_input_tokens_seen": 1908219840,
862
+ "num_tokens": 804538128.0,
863
+ "step": 3950
864
+ },
865
+ {
866
+ "epoch": 0.9656441902922582,
867
+ "grad_norm": 0.291015625,
868
+ "learning_rate": 4.7412102006941305e-05,
869
+ "loss": 2.1447,
870
+ "num_input_tokens_seen": 1932223680,
871
+ "step": 4000
872
+ },
873
+ {
874
+ "epoch": 0.9656441902922582,
875
+ "eval_loss": 2.0152089595794678,
876
+ "eval_mean_token_accuracy": 0.5734592423989222,
877
+ "eval_num_tokens": 814660681.0,
878
+ "eval_runtime": 126.585,
879
+ "eval_samples_per_second": 84.623,
880
+ "eval_steps_per_second": 21.156,
881
+ "num_input_tokens_seen": 1932223680,
882
+ "step": 4000
883
  }
884
  ],
885
  "logging_steps": 50,
886
  "max_steps": 16568,
887
+ "num_input_tokens_seen": 1932223680,
888
  "num_train_epochs": 4,
889
  "save_steps": 1000,
890
  "stateful_callbacks": {
 
899
  "attributes": {}
900
  }
901
  },
902
+ "total_flos": 5.168886929031168e+17,
903
  "train_batch_size": 16,
904
  "trial_name": null,
905
  "trial_params": null