CocoRoF commited on
Commit
81dc68b
·
verified ·
1 Parent(s): 76cd9a7

Training in progress, step 1000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa115a9b44eddd31239249af967a9cc68c05d2e90e11fed79b56ff0bcbe835bb
3
  size 791869518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6415eeae002a06eaee3e6c8d4cb9f7505094d1f36ee4e29e435d17300ed0530d
3
  size 791869518
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc218283c198560bdfa85f3ab5c42a23aac9930dbda514ca712373372068b858
3
  size 2375752250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a52ae8230dd79da1bc508a923dbccfd0826e740333dd2ce1ba9f9bed20ef632c
3
  size 2375752250
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:78d3f197f6c6558fa8056324f1563ab9e957255f5a1a959362aa4eed7a9545db
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74386f26f36ed67f56395205881e5db2d0c28ffcbeed50dd95b28771d2dac588
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c1a9c65c2869356282cad6b4a0f7dff7f4dd68ab3d9d216c72b7d6cb524f860
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41c88f9de084200454883a13c3717941ea3fd433e2f8735507fc30611f9c5501
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:896febe768e17bae5022a95960c041f6425783774ec8859d99d3b149063b1bf9
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:965b00d4cb4710ebab57c8787b9925bb3f77b8eeba94a186ec4bc1c2f326ef3f
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eac482d57e966585467c8ef44dae2869bf7e5d92886f69c11ed7bccc34c07efe
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5dc374b8b9a4c45c950f9d136feab85a767081fa59f0c7d68ed3a62060c4949
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e1f27d227a20dc320ac283e0938fb2f6e5b475829a583f8c44d1a16a8c828307
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c7c212fb779217f1edac0baf44f67b608eefc1e0e4e3f5a9dd7eb557032c1bc
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d05a7106aaeaec4b81704e3f4a998b5123cf9342a6733bd9fd2d578e99108c3b
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86e1effd626ce1e95dd68a0c8089fe19218f2b24dfe9e45ed2cab1c0ebc10ba1
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b94120d8d88502ec8d8b623ec7550315caca003b44fcffbb5767ab0de91baefe
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:799cc83f60dfc1c4243cfd6403592112414a2eec494e6832f10221c96ff62c20
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:332e4d901be380f740b5d8578f7b80ef1865c7fba83bc288c8a35852205cc668
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:586777c398770c3255d3a1f48c7fef44ea9d89117c627c9ea490e16bfd9a49ba
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ac6dc5dc1f7c6772a7fe1da65c3c395e23b56d006497e0dee9efefb278bb143
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf407b3cceb49ac6bb524b596a50ae77a5444821a3bb5dab22e8c85264a65a8c
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.362499433594635,
5
  "eval_steps": 500,
6
- "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -715,6 +715,714 @@
715
  "eval_samples_per_second": 595.745,
716
  "eval_steps_per_second": 37.241,
717
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
718
  }
719
  ],
720
  "logging_steps": 5,
@@ -734,7 +1442,7 @@
734
  "attributes": {}
735
  }
736
  },
737
- "total_flos": 2.1661789963943936e+18,
738
  "train_batch_size": 4,
739
  "trial_name": null,
740
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.72499886718927,
5
  "eval_steps": 500,
6
+ "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
715
  "eval_samples_per_second": 595.745,
716
  "eval_steps_per_second": 37.241,
717
  "step": 500
718
+ },
719
+ {
720
+ "epoch": 0.36612442793058136,
721
+ "grad_norm": 100.625,
722
+ "learning_rate": 2.112812248186946e-06,
723
+ "loss": 90.367,
724
+ "step": 505
725
+ },
726
+ {
727
+ "epoch": 0.3697494222665277,
728
+ "grad_norm": 99.125,
729
+ "learning_rate": 2.1007252215954874e-06,
730
+ "loss": 89.764,
731
+ "step": 510
732
+ },
733
+ {
734
+ "epoch": 0.37337441660247406,
735
+ "grad_norm": 98.0625,
736
+ "learning_rate": 2.0886381950040293e-06,
737
+ "loss": 90.377,
738
+ "step": 515
739
+ },
740
+ {
741
+ "epoch": 0.37699941093842043,
742
+ "grad_norm": 97.125,
743
+ "learning_rate": 2.0765511684125707e-06,
744
+ "loss": 89.3218,
745
+ "step": 520
746
+ },
747
+ {
748
+ "epoch": 0.38062440527436675,
749
+ "grad_norm": 99.1875,
750
+ "learning_rate": 2.064464141821112e-06,
751
+ "loss": 90.3385,
752
+ "step": 525
753
+ },
754
+ {
755
+ "epoch": 0.3842493996103131,
756
+ "grad_norm": 101.125,
757
+ "learning_rate": 2.0523771152296536e-06,
758
+ "loss": 88.8969,
759
+ "step": 530
760
+ },
761
+ {
762
+ "epoch": 0.38787439394625944,
763
+ "grad_norm": 92.8125,
764
+ "learning_rate": 2.040290088638195e-06,
765
+ "loss": 90.3189,
766
+ "step": 535
767
+ },
768
+ {
769
+ "epoch": 0.3914993882822058,
770
+ "grad_norm": 93.0625,
771
+ "learning_rate": 2.0282030620467365e-06,
772
+ "loss": 88.3214,
773
+ "step": 540
774
+ },
775
+ {
776
+ "epoch": 0.39512438261815214,
777
+ "grad_norm": 96.0,
778
+ "learning_rate": 2.016116035455278e-06,
779
+ "loss": 90.5051,
780
+ "step": 545
781
+ },
782
+ {
783
+ "epoch": 0.3987493769540985,
784
+ "grad_norm": 97.75,
785
+ "learning_rate": 2.0040290088638194e-06,
786
+ "loss": 89.0853,
787
+ "step": 550
788
+ },
789
+ {
790
+ "epoch": 0.4023743712900449,
791
+ "grad_norm": 96.125,
792
+ "learning_rate": 1.991941982272361e-06,
793
+ "loss": 89.5784,
794
+ "step": 555
795
+ },
796
+ {
797
+ "epoch": 0.4059993656259912,
798
+ "grad_norm": 93.5625,
799
+ "learning_rate": 1.9798549556809023e-06,
800
+ "loss": 89.3408,
801
+ "step": 560
802
+ },
803
+ {
804
+ "epoch": 0.4096243599619376,
805
+ "grad_norm": 95.1875,
806
+ "learning_rate": 1.9677679290894438e-06,
807
+ "loss": 89.4404,
808
+ "step": 565
809
+ },
810
+ {
811
+ "epoch": 0.4132493542978839,
812
+ "grad_norm": 99.1875,
813
+ "learning_rate": 1.9556809024979856e-06,
814
+ "loss": 89.5664,
815
+ "step": 570
816
+ },
817
+ {
818
+ "epoch": 0.41687434863383027,
819
+ "grad_norm": 90.1875,
820
+ "learning_rate": 1.943593875906527e-06,
821
+ "loss": 89.5359,
822
+ "step": 575
823
+ },
824
+ {
825
+ "epoch": 0.4204993429697766,
826
+ "grad_norm": 100.625,
827
+ "learning_rate": 1.9315068493150685e-06,
828
+ "loss": 88.9519,
829
+ "step": 580
830
+ },
831
+ {
832
+ "epoch": 0.42412433730572296,
833
+ "grad_norm": 95.375,
834
+ "learning_rate": 1.91941982272361e-06,
835
+ "loss": 89.6489,
836
+ "step": 585
837
+ },
838
+ {
839
+ "epoch": 0.4277493316416693,
840
+ "grad_norm": 100.0,
841
+ "learning_rate": 1.9073327961321514e-06,
842
+ "loss": 88.3665,
843
+ "step": 590
844
+ },
845
+ {
846
+ "epoch": 0.43137432597761566,
847
+ "grad_norm": 92.5,
848
+ "learning_rate": 1.8952457695406929e-06,
849
+ "loss": 87.8933,
850
+ "step": 595
851
+ },
852
+ {
853
+ "epoch": 0.43499932031356203,
854
+ "grad_norm": 95.125,
855
+ "learning_rate": 1.8831587429492343e-06,
856
+ "loss": 89.0427,
857
+ "step": 600
858
+ },
859
+ {
860
+ "epoch": 0.43862431464950835,
861
+ "grad_norm": 93.5625,
862
+ "learning_rate": 1.8710717163577762e-06,
863
+ "loss": 88.9729,
864
+ "step": 605
865
+ },
866
+ {
867
+ "epoch": 0.4422493089854547,
868
+ "grad_norm": 94.125,
869
+ "learning_rate": 1.8589846897663176e-06,
870
+ "loss": 88.4542,
871
+ "step": 610
872
+ },
873
+ {
874
+ "epoch": 0.44587430332140104,
875
+ "grad_norm": 94.25,
876
+ "learning_rate": 1.846897663174859e-06,
877
+ "loss": 87.9236,
878
+ "step": 615
879
+ },
880
+ {
881
+ "epoch": 0.4494992976573474,
882
+ "grad_norm": 96.3125,
883
+ "learning_rate": 1.8348106365834005e-06,
884
+ "loss": 88.8232,
885
+ "step": 620
886
+ },
887
+ {
888
+ "epoch": 0.45312429199329374,
889
+ "grad_norm": 92.875,
890
+ "learning_rate": 1.822723609991942e-06,
891
+ "loss": 87.6141,
892
+ "step": 625
893
+ },
894
+ {
895
+ "epoch": 0.4567492863292401,
896
+ "grad_norm": 94.8125,
897
+ "learning_rate": 1.8106365834004834e-06,
898
+ "loss": 89.5391,
899
+ "step": 630
900
+ },
901
+ {
902
+ "epoch": 0.4603742806651865,
903
+ "grad_norm": 96.625,
904
+ "learning_rate": 1.7985495568090249e-06,
905
+ "loss": 87.6836,
906
+ "step": 635
907
+ },
908
+ {
909
+ "epoch": 0.4639992750011328,
910
+ "grad_norm": 98.5,
911
+ "learning_rate": 1.7864625302175668e-06,
912
+ "loss": 88.0843,
913
+ "step": 640
914
+ },
915
+ {
916
+ "epoch": 0.4676242693370792,
917
+ "grad_norm": 95.4375,
918
+ "learning_rate": 1.7743755036261082e-06,
919
+ "loss": 88.7523,
920
+ "step": 645
921
+ },
922
+ {
923
+ "epoch": 0.4712492636730255,
924
+ "grad_norm": 96.9375,
925
+ "learning_rate": 1.7622884770346497e-06,
926
+ "loss": 88.2255,
927
+ "step": 650
928
+ },
929
+ {
930
+ "epoch": 0.4748742580089719,
931
+ "grad_norm": 95.75,
932
+ "learning_rate": 1.750201450443191e-06,
933
+ "loss": 88.399,
934
+ "step": 655
935
+ },
936
+ {
937
+ "epoch": 0.4784992523449182,
938
+ "grad_norm": 99.4375,
939
+ "learning_rate": 1.7381144238517325e-06,
940
+ "loss": 86.5224,
941
+ "step": 660
942
+ },
943
+ {
944
+ "epoch": 0.48212424668086457,
945
+ "grad_norm": 93.125,
946
+ "learning_rate": 1.726027397260274e-06,
947
+ "loss": 86.3948,
948
+ "step": 665
949
+ },
950
+ {
951
+ "epoch": 0.4857492410168109,
952
+ "grad_norm": 96.3125,
953
+ "learning_rate": 1.7139403706688154e-06,
954
+ "loss": 87.6118,
955
+ "step": 670
956
+ },
957
+ {
958
+ "epoch": 0.48937423535275726,
959
+ "grad_norm": 95.8125,
960
+ "learning_rate": 1.701853344077357e-06,
961
+ "loss": 86.8485,
962
+ "step": 675
963
+ },
964
+ {
965
+ "epoch": 0.49299922968870363,
966
+ "grad_norm": 91.3125,
967
+ "learning_rate": 1.6897663174858985e-06,
968
+ "loss": 88.4679,
969
+ "step": 680
970
+ },
971
+ {
972
+ "epoch": 0.49662422402464995,
973
+ "grad_norm": 93.9375,
974
+ "learning_rate": 1.67767929089444e-06,
975
+ "loss": 87.9796,
976
+ "step": 685
977
+ },
978
+ {
979
+ "epoch": 0.5002492183605963,
980
+ "grad_norm": 97.3125,
981
+ "learning_rate": 1.6655922643029814e-06,
982
+ "loss": 87.4862,
983
+ "step": 690
984
+ },
985
+ {
986
+ "epoch": 0.5038742126965426,
987
+ "grad_norm": 91.125,
988
+ "learning_rate": 1.6535052377115229e-06,
989
+ "loss": 86.7279,
990
+ "step": 695
991
+ },
992
+ {
993
+ "epoch": 0.507499207032489,
994
+ "grad_norm": 94.5,
995
+ "learning_rate": 1.6414182111200643e-06,
996
+ "loss": 87.0621,
997
+ "step": 700
998
+ },
999
+ {
1000
+ "epoch": 0.5111242013684354,
1001
+ "grad_norm": 95.5,
1002
+ "learning_rate": 1.6293311845286058e-06,
1003
+ "loss": 87.104,
1004
+ "step": 705
1005
+ },
1006
+ {
1007
+ "epoch": 0.5147491957043817,
1008
+ "grad_norm": 94.9375,
1009
+ "learning_rate": 1.6172441579371477e-06,
1010
+ "loss": 86.7461,
1011
+ "step": 710
1012
+ },
1013
+ {
1014
+ "epoch": 0.518374190040328,
1015
+ "grad_norm": 93.125,
1016
+ "learning_rate": 1.605157131345689e-06,
1017
+ "loss": 87.189,
1018
+ "step": 715
1019
+ },
1020
+ {
1021
+ "epoch": 0.5219991843762745,
1022
+ "grad_norm": 96.5625,
1023
+ "learning_rate": 1.5930701047542306e-06,
1024
+ "loss": 86.5543,
1025
+ "step": 720
1026
+ },
1027
+ {
1028
+ "epoch": 0.5256241787122208,
1029
+ "grad_norm": 94.5,
1030
+ "learning_rate": 1.580983078162772e-06,
1031
+ "loss": 87.1241,
1032
+ "step": 725
1033
+ },
1034
+ {
1035
+ "epoch": 0.5292491730481671,
1036
+ "grad_norm": 93.375,
1037
+ "learning_rate": 1.5688960515713134e-06,
1038
+ "loss": 86.1505,
1039
+ "step": 730
1040
+ },
1041
+ {
1042
+ "epoch": 0.5328741673841134,
1043
+ "grad_norm": 94.625,
1044
+ "learning_rate": 1.556809024979855e-06,
1045
+ "loss": 87.0848,
1046
+ "step": 735
1047
+ },
1048
+ {
1049
+ "epoch": 0.5364991617200598,
1050
+ "grad_norm": 96.375,
1051
+ "learning_rate": 1.5447219983883963e-06,
1052
+ "loss": 86.5279,
1053
+ "step": 740
1054
+ },
1055
+ {
1056
+ "epoch": 0.5401241560560062,
1057
+ "grad_norm": 96.875,
1058
+ "learning_rate": 1.5326349717969382e-06,
1059
+ "loss": 87.5163,
1060
+ "step": 745
1061
+ },
1062
+ {
1063
+ "epoch": 0.5437491503919525,
1064
+ "grad_norm": 96.75,
1065
+ "learning_rate": 1.5205479452054797e-06,
1066
+ "loss": 84.3448,
1067
+ "step": 750
1068
+ },
1069
+ {
1070
+ "epoch": 0.5473741447278989,
1071
+ "grad_norm": 99.0,
1072
+ "learning_rate": 1.5084609186140211e-06,
1073
+ "loss": 86.8625,
1074
+ "step": 755
1075
+ },
1076
+ {
1077
+ "epoch": 0.5509991390638452,
1078
+ "grad_norm": 93.5,
1079
+ "learning_rate": 1.4963738920225626e-06,
1080
+ "loss": 85.2925,
1081
+ "step": 760
1082
+ },
1083
+ {
1084
+ "epoch": 0.5546241333997916,
1085
+ "grad_norm": 94.3125,
1086
+ "learning_rate": 1.484286865431104e-06,
1087
+ "loss": 87.5748,
1088
+ "step": 765
1089
+ },
1090
+ {
1091
+ "epoch": 0.5582491277357379,
1092
+ "grad_norm": 92.75,
1093
+ "learning_rate": 1.4721998388396455e-06,
1094
+ "loss": 86.6436,
1095
+ "step": 770
1096
+ },
1097
+ {
1098
+ "epoch": 0.5618741220716843,
1099
+ "grad_norm": 95.875,
1100
+ "learning_rate": 1.460112812248187e-06,
1101
+ "loss": 85.7076,
1102
+ "step": 775
1103
+ },
1104
+ {
1105
+ "epoch": 0.5654991164076306,
1106
+ "grad_norm": 96.625,
1107
+ "learning_rate": 1.4480257856567283e-06,
1108
+ "loss": 86.4671,
1109
+ "step": 780
1110
+ },
1111
+ {
1112
+ "epoch": 0.5691241107435769,
1113
+ "grad_norm": 95.375,
1114
+ "learning_rate": 1.43593875906527e-06,
1115
+ "loss": 85.973,
1116
+ "step": 785
1117
+ },
1118
+ {
1119
+ "epoch": 0.5727491050795234,
1120
+ "grad_norm": 95.75,
1121
+ "learning_rate": 1.4238517324738115e-06,
1122
+ "loss": 84.9478,
1123
+ "step": 790
1124
+ },
1125
+ {
1126
+ "epoch": 0.5763740994154697,
1127
+ "grad_norm": 94.875,
1128
+ "learning_rate": 1.411764705882353e-06,
1129
+ "loss": 85.6088,
1130
+ "step": 795
1131
+ },
1132
+ {
1133
+ "epoch": 0.579999093751416,
1134
+ "grad_norm": 93.75,
1135
+ "learning_rate": 1.3996776792908943e-06,
1136
+ "loss": 86.5241,
1137
+ "step": 800
1138
+ },
1139
+ {
1140
+ "epoch": 0.5836240880873623,
1141
+ "grad_norm": 93.0625,
1142
+ "learning_rate": 1.387590652699436e-06,
1143
+ "loss": 85.3578,
1144
+ "step": 805
1145
+ },
1146
+ {
1147
+ "epoch": 0.5872490824233088,
1148
+ "grad_norm": 94.25,
1149
+ "learning_rate": 1.3755036261079775e-06,
1150
+ "loss": 86.0338,
1151
+ "step": 810
1152
+ },
1153
+ {
1154
+ "epoch": 0.5908740767592551,
1155
+ "grad_norm": 92.6875,
1156
+ "learning_rate": 1.363416599516519e-06,
1157
+ "loss": 84.5091,
1158
+ "step": 815
1159
+ },
1160
+ {
1161
+ "epoch": 0.5944990710952014,
1162
+ "grad_norm": 92.8125,
1163
+ "learning_rate": 1.3513295729250606e-06,
1164
+ "loss": 85.7377,
1165
+ "step": 820
1166
+ },
1167
+ {
1168
+ "epoch": 0.5981240654311477,
1169
+ "grad_norm": 91.125,
1170
+ "learning_rate": 1.339242546333602e-06,
1171
+ "loss": 84.7625,
1172
+ "step": 825
1173
+ },
1174
+ {
1175
+ "epoch": 0.6017490597670941,
1176
+ "grad_norm": 93.1875,
1177
+ "learning_rate": 1.3271555197421435e-06,
1178
+ "loss": 84.8545,
1179
+ "step": 830
1180
+ },
1181
+ {
1182
+ "epoch": 0.6053740541030405,
1183
+ "grad_norm": 94.8125,
1184
+ "learning_rate": 1.315068493150685e-06,
1185
+ "loss": 85.5188,
1186
+ "step": 835
1187
+ },
1188
+ {
1189
+ "epoch": 0.6089990484389868,
1190
+ "grad_norm": 91.75,
1191
+ "learning_rate": 1.3029814665592266e-06,
1192
+ "loss": 86.0601,
1193
+ "step": 840
1194
+ },
1195
+ {
1196
+ "epoch": 0.6126240427749332,
1197
+ "grad_norm": 94.375,
1198
+ "learning_rate": 1.290894439967768e-06,
1199
+ "loss": 85.4887,
1200
+ "step": 845
1201
+ },
1202
+ {
1203
+ "epoch": 0.6162490371108795,
1204
+ "grad_norm": 95.375,
1205
+ "learning_rate": 1.2788074133763095e-06,
1206
+ "loss": 85.1952,
1207
+ "step": 850
1208
+ },
1209
+ {
1210
+ "epoch": 0.6198740314468258,
1211
+ "grad_norm": 95.1875,
1212
+ "learning_rate": 1.2667203867848511e-06,
1213
+ "loss": 84.1906,
1214
+ "step": 855
1215
+ },
1216
+ {
1217
+ "epoch": 0.6234990257827722,
1218
+ "grad_norm": 92.8125,
1219
+ "learning_rate": 1.2546333601933926e-06,
1220
+ "loss": 84.6792,
1221
+ "step": 860
1222
+ },
1223
+ {
1224
+ "epoch": 0.6271240201187186,
1225
+ "grad_norm": 92.625,
1226
+ "learning_rate": 1.242546333601934e-06,
1227
+ "loss": 85.1767,
1228
+ "step": 865
1229
+ },
1230
+ {
1231
+ "epoch": 0.6307490144546649,
1232
+ "grad_norm": 91.3125,
1233
+ "learning_rate": 1.2304593070104755e-06,
1234
+ "loss": 84.0814,
1235
+ "step": 870
1236
+ },
1237
+ {
1238
+ "epoch": 0.6343740087906112,
1239
+ "grad_norm": 96.1875,
1240
+ "learning_rate": 1.218372280419017e-06,
1241
+ "loss": 84.6096,
1242
+ "step": 875
1243
+ },
1244
+ {
1245
+ "epoch": 0.6379990031265577,
1246
+ "grad_norm": 90.5625,
1247
+ "learning_rate": 1.2062852538275584e-06,
1248
+ "loss": 84.8486,
1249
+ "step": 880
1250
+ },
1251
+ {
1252
+ "epoch": 0.641623997462504,
1253
+ "grad_norm": 92.6875,
1254
+ "learning_rate": 1.1941982272360998e-06,
1255
+ "loss": 84.4379,
1256
+ "step": 885
1257
+ },
1258
+ {
1259
+ "epoch": 0.6452489917984503,
1260
+ "grad_norm": 96.5,
1261
+ "learning_rate": 1.1821112006446415e-06,
1262
+ "loss": 85.6003,
1263
+ "step": 890
1264
+ },
1265
+ {
1266
+ "epoch": 0.6488739861343966,
1267
+ "grad_norm": 96.25,
1268
+ "learning_rate": 1.170024174053183e-06,
1269
+ "loss": 83.5816,
1270
+ "step": 895
1271
+ },
1272
+ {
1273
+ "epoch": 0.652498980470343,
1274
+ "grad_norm": 90.25,
1275
+ "learning_rate": 1.1579371474617244e-06,
1276
+ "loss": 84.2162,
1277
+ "step": 900
1278
+ },
1279
+ {
1280
+ "epoch": 0.6561239748062894,
1281
+ "grad_norm": 98.5,
1282
+ "learning_rate": 1.1458501208702658e-06,
1283
+ "loss": 83.1881,
1284
+ "step": 905
1285
+ },
1286
+ {
1287
+ "epoch": 0.6597489691422357,
1288
+ "grad_norm": 92.0,
1289
+ "learning_rate": 1.1337630942788075e-06,
1290
+ "loss": 84.7499,
1291
+ "step": 910
1292
+ },
1293
+ {
1294
+ "epoch": 0.6633739634781821,
1295
+ "grad_norm": 93.3125,
1296
+ "learning_rate": 1.121676067687349e-06,
1297
+ "loss": 83.7881,
1298
+ "step": 915
1299
+ },
1300
+ {
1301
+ "epoch": 0.6669989578141284,
1302
+ "grad_norm": 94.75,
1303
+ "learning_rate": 1.1095890410958904e-06,
1304
+ "loss": 83.8075,
1305
+ "step": 920
1306
+ },
1307
+ {
1308
+ "epoch": 0.6706239521500748,
1309
+ "grad_norm": 93.4375,
1310
+ "learning_rate": 1.097502014504432e-06,
1311
+ "loss": 83.9208,
1312
+ "step": 925
1313
+ },
1314
+ {
1315
+ "epoch": 0.6742489464860211,
1316
+ "grad_norm": 96.0,
1317
+ "learning_rate": 1.0854149879129735e-06,
1318
+ "loss": 83.8316,
1319
+ "step": 930
1320
+ },
1321
+ {
1322
+ "epoch": 0.6778739408219675,
1323
+ "grad_norm": 97.375,
1324
+ "learning_rate": 1.073327961321515e-06,
1325
+ "loss": 83.665,
1326
+ "step": 935
1327
+ },
1328
+ {
1329
+ "epoch": 0.6814989351579138,
1330
+ "grad_norm": 93.875,
1331
+ "learning_rate": 1.0612409347300566e-06,
1332
+ "loss": 84.09,
1333
+ "step": 940
1334
+ },
1335
+ {
1336
+ "epoch": 0.6851239294938601,
1337
+ "grad_norm": 95.125,
1338
+ "learning_rate": 1.049153908138598e-06,
1339
+ "loss": 83.6363,
1340
+ "step": 945
1341
+ },
1342
+ {
1343
+ "epoch": 0.6887489238298066,
1344
+ "grad_norm": 93.5,
1345
+ "learning_rate": 1.0370668815471395e-06,
1346
+ "loss": 82.6956,
1347
+ "step": 950
1348
+ },
1349
+ {
1350
+ "epoch": 0.6923739181657529,
1351
+ "grad_norm": 91.5,
1352
+ "learning_rate": 1.024979854955681e-06,
1353
+ "loss": 83.8282,
1354
+ "step": 955
1355
+ },
1356
+ {
1357
+ "epoch": 0.6959989125016992,
1358
+ "grad_norm": 101.25,
1359
+ "learning_rate": 1.0128928283642226e-06,
1360
+ "loss": 83.393,
1361
+ "step": 960
1362
+ },
1363
+ {
1364
+ "epoch": 0.6996239068376455,
1365
+ "grad_norm": 93.6875,
1366
+ "learning_rate": 1.000805801772764e-06,
1367
+ "loss": 82.5698,
1368
+ "step": 965
1369
+ },
1370
+ {
1371
+ "epoch": 0.703248901173592,
1372
+ "grad_norm": 94.3125,
1373
+ "learning_rate": 9.887187751813055e-07,
1374
+ "loss": 82.9595,
1375
+ "step": 970
1376
+ },
1377
+ {
1378
+ "epoch": 0.7068738955095383,
1379
+ "grad_norm": 93.5,
1380
+ "learning_rate": 9.76631748589847e-07,
1381
+ "loss": 82.2969,
1382
+ "step": 975
1383
+ },
1384
+ {
1385
+ "epoch": 0.7104988898454846,
1386
+ "grad_norm": 91.125,
1387
+ "learning_rate": 9.645447219983884e-07,
1388
+ "loss": 84.9823,
1389
+ "step": 980
1390
+ },
1391
+ {
1392
+ "epoch": 0.7141238841814309,
1393
+ "grad_norm": 93.0,
1394
+ "learning_rate": 9.524576954069299e-07,
1395
+ "loss": 84.2716,
1396
+ "step": 985
1397
+ },
1398
+ {
1399
+ "epoch": 0.7177488785173773,
1400
+ "grad_norm": 90.5,
1401
+ "learning_rate": 9.403706688154714e-07,
1402
+ "loss": 84.1091,
1403
+ "step": 990
1404
+ },
1405
+ {
1406
+ "epoch": 0.7213738728533237,
1407
+ "grad_norm": 94.5625,
1408
+ "learning_rate": 9.282836422240129e-07,
1409
+ "loss": 82.6404,
1410
+ "step": 995
1411
+ },
1412
+ {
1413
+ "epoch": 0.72499886718927,
1414
+ "grad_norm": 94.5625,
1415
+ "learning_rate": 9.161966156325544e-07,
1416
+ "loss": 82.809,
1417
+ "step": 1000
1418
+ },
1419
+ {
1420
+ "epoch": 0.72499886718927,
1421
+ "eval_loss": NaN,
1422
+ "eval_runtime": 124.4615,
1423
+ "eval_samples_per_second": 597.277,
1424
+ "eval_steps_per_second": 37.337,
1425
+ "step": 1000
1426
  }
1427
  ],
1428
  "logging_steps": 5,
 
1442
  "attributes": {}
1443
  }
1444
  },
1445
+ "total_flos": 4.332357992788787e+18,
1446
  "train_batch_size": 4,
1447
  "trial_name": null,
1448
  "trial_params": null