Azrail committed on
Commit
3a55628
·
verified ·
1 Parent(s): 8f5af64

Training in progress, step 5000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a313d509db7def5f49214f9d05b89c42300ce0ca3fd0d7a1b4c56154cf0a72db
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37296df7790e03d83312df3152295ea5675574fc24606e23051f92ba2a8785cd
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ad0ef37ec0c5bff68abf0acafb2e524cd857e55490e94ef61cc44d1f7b08679
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddcb0501b4cfee967f8db5bb4fa8fb92b655ff610b064b96b43d66b2ba0fdac4
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b2c7bb39719c8f039a2a4dd5473921c41a834e3390491b2b93e9a2772ee802f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4c247dc7c4172df7c1d104b1da0eaec0df0b665cbc24707f3227675351f1df9
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bff7808903acfc88c8f83b83043a92f900db8f72ffc7d87d61c8ee1abceef7bc
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ad5c996b0875772675f1bc75e15a0dbeb09c5ba7146d169befa6908149e4159
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.08786417297160752,
6
  "eval_steps": 500,
7
- "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -720,11 +720,189 @@
720
  "eval_steps_per_second": 19.053,
721
  "num_input_tokens_seen": 4194304000,
722
  "step": 4000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
723
  }
724
  ],
725
  "logging_steps": 50,
726
  "max_steps": 200000,
727
- "num_input_tokens_seen": 4194304000,
728
  "num_train_epochs": 5,
729
  "save_steps": 1000,
730
  "stateful_callbacks": {
@@ -739,7 +917,7 @@
739
  "attributes": {}
740
  }
741
  },
742
- "total_flos": 2.388686863859712e+18,
743
  "train_batch_size": 64,
744
  "trial_name": null,
745
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.10983021621450939,
6
  "eval_steps": 500,
7
+ "global_step": 5000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
720
  "eval_steps_per_second": 19.053,
721
  "num_input_tokens_seen": 4194304000,
722
  "step": 4000
723
+ },
724
+ {
725
+ "epoch": 0.0889624751337526,
726
+ "grad_norm": 0.1839856207370758,
727
+ "learning_rate": 0.0008100000000000001,
728
+ "loss": 3.0862,
729
+ "num_input_tokens_seen": 4246732800,
730
+ "step": 4050
731
+ },
732
+ {
733
+ "epoch": 0.0900607772958977,
734
+ "grad_norm": 0.17331145703792572,
735
+ "learning_rate": 0.00082,
736
+ "loss": 3.087,
737
+ "num_input_tokens_seen": 4299161600,
738
+ "step": 4100
739
+ },
740
+ {
741
+ "epoch": 0.0911590794580428,
742
+ "grad_norm": 0.18384258449077606,
743
+ "learning_rate": 0.00083,
744
+ "loss": 3.076,
745
+ "num_input_tokens_seen": 4351590400,
746
+ "step": 4150
747
+ },
748
+ {
749
+ "epoch": 0.09225738162018789,
750
+ "grad_norm": 0.17061170935630798,
751
+ "learning_rate": 0.00084,
752
+ "loss": 3.0693,
753
+ "num_input_tokens_seen": 4404019200,
754
+ "step": 4200
755
+ },
756
+ {
757
+ "epoch": 0.09335568378233298,
758
+ "grad_norm": 0.18157647550106049,
759
+ "learning_rate": 0.00085,
760
+ "loss": 3.0698,
761
+ "num_input_tokens_seen": 4456448000,
762
+ "step": 4250
763
+ },
764
+ {
765
+ "epoch": 0.09445398594447808,
766
+ "grad_norm": 0.15678547322750092,
767
+ "learning_rate": 0.00086,
768
+ "loss": 3.064,
769
+ "num_input_tokens_seen": 4508876800,
770
+ "step": 4300
771
+ },
772
+ {
773
+ "epoch": 0.09555228810662317,
774
+ "grad_norm": 0.19118325412273407,
775
+ "learning_rate": 0.00087,
776
+ "loss": 3.0541,
777
+ "num_input_tokens_seen": 4561305600,
778
+ "step": 4350
779
+ },
780
+ {
781
+ "epoch": 0.09665059026876827,
782
+ "grad_norm": 0.17620691657066345,
783
+ "learning_rate": 0.00088,
784
+ "loss": 3.0532,
785
+ "num_input_tokens_seen": 4613734400,
786
+ "step": 4400
787
+ },
788
+ {
789
+ "epoch": 0.09774889243091336,
790
+ "grad_norm": 0.17351101338863373,
791
+ "learning_rate": 0.0008900000000000001,
792
+ "loss": 3.0549,
793
+ "num_input_tokens_seen": 4666163200,
794
+ "step": 4450
795
+ },
796
+ {
797
+ "epoch": 0.09884719459305845,
798
+ "grad_norm": 0.15183581411838531,
799
+ "learning_rate": 0.0009000000000000001,
800
+ "loss": 3.0485,
801
+ "num_input_tokens_seen": 4718592000,
802
+ "step": 4500
803
+ },
804
+ {
805
+ "epoch": 0.09884719459305845,
806
+ "eval_loss": 2.9479379653930664,
807
+ "eval_runtime": 66.5611,
808
+ "eval_samples_per_second": 75.119,
809
+ "eval_steps_per_second": 18.78,
810
+ "num_input_tokens_seen": 4718592000,
811
+ "step": 4500
812
+ },
813
+ {
814
+ "epoch": 0.09994549675520355,
815
+ "grad_norm": 0.1681961864233017,
816
+ "learning_rate": 0.00091,
817
+ "loss": 3.0395,
818
+ "num_input_tokens_seen": 4771020800,
819
+ "step": 4550
820
+ },
821
+ {
822
+ "epoch": 0.10104379891734865,
823
+ "grad_norm": 0.17382557690143585,
824
+ "learning_rate": 0.00092,
825
+ "loss": 3.0371,
826
+ "num_input_tokens_seen": 4823449600,
827
+ "step": 4600
828
+ },
829
+ {
830
+ "epoch": 0.10214210107949374,
831
+ "grad_norm": 0.14377906918525696,
832
+ "learning_rate": 0.00093,
833
+ "loss": 3.0377,
834
+ "num_input_tokens_seen": 4875878400,
835
+ "step": 4650
836
+ },
837
+ {
838
+ "epoch": 0.10324040324163883,
839
+ "grad_norm": 0.1590214967727661,
840
+ "learning_rate": 0.00094,
841
+ "loss": 3.0305,
842
+ "num_input_tokens_seen": 4928307200,
843
+ "step": 4700
844
+ },
845
+ {
846
+ "epoch": 0.10433870540378393,
847
+ "grad_norm": 0.15563353896141052,
848
+ "learning_rate": 0.00095,
849
+ "loss": 3.0254,
850
+ "num_input_tokens_seen": 4980736000,
851
+ "step": 4750
852
+ },
853
+ {
854
+ "epoch": 0.10543700756592903,
855
+ "grad_norm": 0.16002103686332703,
856
+ "learning_rate": 0.00096,
857
+ "loss": 3.0222,
858
+ "num_input_tokens_seen": 5033164800,
859
+ "step": 4800
860
+ },
861
+ {
862
+ "epoch": 0.10653530972807411,
863
+ "grad_norm": 0.1406039148569107,
864
+ "learning_rate": 0.0009699999999999999,
865
+ "loss": 3.0185,
866
+ "num_input_tokens_seen": 5085593600,
867
+ "step": 4850
868
+ },
869
+ {
870
+ "epoch": 0.10763361189021921,
871
+ "grad_norm": 0.14609627425670624,
872
+ "learning_rate": 0.00098,
873
+ "loss": 3.0177,
874
+ "num_input_tokens_seen": 5138022400,
875
+ "step": 4900
876
+ },
877
+ {
878
+ "epoch": 0.1087319140523643,
879
+ "grad_norm": 0.16061657667160034,
880
+ "learning_rate": 0.00099,
881
+ "loss": 3.0137,
882
+ "num_input_tokens_seen": 5190451200,
883
+ "step": 4950
884
+ },
885
+ {
886
+ "epoch": 0.10983021621450939,
887
+ "grad_norm": 0.18423974514007568,
888
+ "learning_rate": 0.001,
889
+ "loss": 3.016,
890
+ "num_input_tokens_seen": 5242880000,
891
+ "step": 5000
892
+ },
893
+ {
894
+ "epoch": 0.10983021621450939,
895
+ "eval_loss": 2.9132862091064453,
896
+ "eval_runtime": 65.7163,
897
+ "eval_samples_per_second": 76.085,
898
+ "eval_steps_per_second": 19.021,
899
+ "num_input_tokens_seen": 5242880000,
900
+ "step": 5000
901
  }
902
  ],
903
  "logging_steps": 50,
904
  "max_steps": 200000,
905
+ "num_input_tokens_seen": 5242880000,
906
  "num_train_epochs": 5,
907
  "save_steps": 1000,
908
  "stateful_callbacks": {
 
917
  "attributes": {}
918
  }
919
  },
920
+ "total_flos": 2.98585857982464e+18,
921
  "train_batch_size": 64,
922
  "trial_name": null,
923
  "trial_params": null