CocoRoF commited on
Commit
2897e26
·
verified ·
1 Parent(s): 2b81cf5

Training in progress, step 1000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b1a0ac2f1926cc0a1931002d53deb7fb42e18ad3c8491e927c266910d415b2d
3
  size 791869518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd78f70bc690f6c96ef8249e16cd3eaed70120ea55548486454d9b19a469e2f9
3
  size 791869518
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a826f1d681160cd4b89a4d224d29d3dd35f09d4180f8b412d770bcd98b2e00ef
3
  size 2375752250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdfdcc66bbfd0f7601e8207cf92cc9b52dd079dc47d6e19e50c732b619c60dd8
3
  size 2375752250
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:78d3f197f6c6558fa8056324f1563ab9e957255f5a1a959362aa4eed7a9545db
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74386f26f36ed67f56395205881e5db2d0c28ffcbeed50dd95b28771d2dac588
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c1a9c65c2869356282cad6b4a0f7dff7f4dd68ab3d9d216c72b7d6cb524f860
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41c88f9de084200454883a13c3717941ea3fd433e2f8735507fc30611f9c5501
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:896febe768e17bae5022a95960c041f6425783774ec8859d99d3b149063b1bf9
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:965b00d4cb4710ebab57c8787b9925bb3f77b8eeba94a186ec4bc1c2f326ef3f
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eac482d57e966585467c8ef44dae2869bf7e5d92886f69c11ed7bccc34c07efe
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5dc374b8b9a4c45c950f9d136feab85a767081fa59f0c7d68ed3a62060c4949
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e1f27d227a20dc320ac283e0938fb2f6e5b475829a583f8c44d1a16a8c828307
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c7c212fb779217f1edac0baf44f67b608eefc1e0e4e3f5a9dd7eb557032c1bc
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d05a7106aaeaec4b81704e3f4a998b5123cf9342a6733bd9fd2d578e99108c3b
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86e1effd626ce1e95dd68a0c8089fe19218f2b24dfe9e45ed2cab1c0ebc10ba1
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b94120d8d88502ec8d8b623ec7550315caca003b44fcffbb5767ab0de91baefe
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:799cc83f60dfc1c4243cfd6403592112414a2eec494e6832f10221c96ff62c20
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:332e4d901be380f740b5d8578f7b80ef1865c7fba83bc288c8a35852205cc668
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:586777c398770c3255d3a1f48c7fef44ea9d89117c627c9ea490e16bfd9a49ba
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3298de4dad69614ea1cbba5a0efb97f176384fffada5231f6dba076b7c6e5edf
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb96e4663ec39ec9f3920ec91963e2a7dc93e87ff87c51ea8d11abe793c9c1d1
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.4452111970616061,
5
  "eval_steps": 500,
6
- "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -715,6 +715,714 @@
715
  "eval_samples_per_second": 595.254,
716
  "eval_steps_per_second": 37.204,
717
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
718
  }
719
  ],
720
  "logging_steps": 5,
@@ -734,7 +1442,7 @@
734
  "attributes": {}
735
  }
736
  },
737
- "total_flos": 2.1661789963943936e+18,
738
  "train_batch_size": 4,
739
  "trial_name": null,
740
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.8904223941232122,
5
  "eval_steps": 500,
6
+ "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
715
  "eval_samples_per_second": 595.254,
716
  "eval_steps_per_second": 37.204,
717
  "step": 500
718
+ },
719
+ {
720
+ "epoch": 0.4496633090322222,
721
+ "grad_norm": 153.0,
722
+ "learning_rate": 1.8356435643564356e-06,
723
+ "loss": 97.0697,
724
+ "step": 505
725
+ },
726
+ {
727
+ "epoch": 0.45411542100283825,
728
+ "grad_norm": 152.375,
729
+ "learning_rate": 1.8207920792079207e-06,
730
+ "loss": 96.6341,
731
+ "step": 510
732
+ },
733
+ {
734
+ "epoch": 0.45856753297345426,
735
+ "grad_norm": 148.625,
736
+ "learning_rate": 1.8059405940594058e-06,
737
+ "loss": 97.238,
738
+ "step": 515
739
+ },
740
+ {
741
+ "epoch": 0.46301964494407033,
742
+ "grad_norm": 156.5,
743
+ "learning_rate": 1.7910891089108912e-06,
744
+ "loss": 97.0582,
745
+ "step": 520
746
+ },
747
+ {
748
+ "epoch": 0.4674717569146864,
749
+ "grad_norm": 156.875,
750
+ "learning_rate": 1.7762376237623763e-06,
751
+ "loss": 99.9485,
752
+ "step": 525
753
+ },
754
+ {
755
+ "epoch": 0.47192386888530247,
756
+ "grad_norm": 150.5,
757
+ "learning_rate": 1.7613861386138614e-06,
758
+ "loss": 98.0501,
759
+ "step": 530
760
+ },
761
+ {
762
+ "epoch": 0.47637598085591853,
763
+ "grad_norm": 149.0,
764
+ "learning_rate": 1.7465346534653465e-06,
765
+ "loss": 97.4984,
766
+ "step": 535
767
+ },
768
+ {
769
+ "epoch": 0.4808280928265346,
770
+ "grad_norm": 159.0,
771
+ "learning_rate": 1.7316831683168316e-06,
772
+ "loss": 94.833,
773
+ "step": 540
774
+ },
775
+ {
776
+ "epoch": 0.48528020479715067,
777
+ "grad_norm": 154.0,
778
+ "learning_rate": 1.7168316831683167e-06,
779
+ "loss": 96.7634,
780
+ "step": 545
781
+ },
782
+ {
783
+ "epoch": 0.4897323167677667,
784
+ "grad_norm": 145.75,
785
+ "learning_rate": 1.7019801980198019e-06,
786
+ "loss": 96.3564,
787
+ "step": 550
788
+ },
789
+ {
790
+ "epoch": 0.49418442873838275,
791
+ "grad_norm": 150.75,
792
+ "learning_rate": 1.6871287128712874e-06,
793
+ "loss": 96.0712,
794
+ "step": 555
795
+ },
796
+ {
797
+ "epoch": 0.4986365407089988,
798
+ "grad_norm": 149.625,
799
+ "learning_rate": 1.6722772277227725e-06,
800
+ "loss": 97.4836,
801
+ "step": 560
802
+ },
803
+ {
804
+ "epoch": 0.5030886526796149,
805
+ "grad_norm": 145.375,
806
+ "learning_rate": 1.6574257425742576e-06,
807
+ "loss": 98.0151,
808
+ "step": 565
809
+ },
810
+ {
811
+ "epoch": 0.5075407646502309,
812
+ "grad_norm": 156.875,
813
+ "learning_rate": 1.6425742574257427e-06,
814
+ "loss": 97.3034,
815
+ "step": 570
816
+ },
817
+ {
818
+ "epoch": 0.511992876620847,
819
+ "grad_norm": 149.125,
820
+ "learning_rate": 1.6277227722772279e-06,
821
+ "loss": 97.3657,
822
+ "step": 575
823
+ },
824
+ {
825
+ "epoch": 0.516444988591463,
826
+ "grad_norm": 153.0,
827
+ "learning_rate": 1.612871287128713e-06,
828
+ "loss": 96.8924,
829
+ "step": 580
830
+ },
831
+ {
832
+ "epoch": 0.5208971005620792,
833
+ "grad_norm": 151.0,
834
+ "learning_rate": 1.598019801980198e-06,
835
+ "loss": 97.888,
836
+ "step": 585
837
+ },
838
+ {
839
+ "epoch": 0.5253492125326952,
840
+ "grad_norm": 157.5,
841
+ "learning_rate": 1.5831683168316832e-06,
842
+ "loss": 95.7355,
843
+ "step": 590
844
+ },
845
+ {
846
+ "epoch": 0.5298013245033113,
847
+ "grad_norm": 150.5,
848
+ "learning_rate": 1.5683168316831683e-06,
849
+ "loss": 94.4425,
850
+ "step": 595
851
+ },
852
+ {
853
+ "epoch": 0.5342534364739273,
854
+ "grad_norm": 153.0,
855
+ "learning_rate": 1.5534653465346534e-06,
856
+ "loss": 96.4229,
857
+ "step": 600
858
+ },
859
+ {
860
+ "epoch": 0.5387055484445433,
861
+ "grad_norm": 152.875,
862
+ "learning_rate": 1.5386138613861388e-06,
863
+ "loss": 96.716,
864
+ "step": 605
865
+ },
866
+ {
867
+ "epoch": 0.5431576604151594,
868
+ "grad_norm": 151.75,
869
+ "learning_rate": 1.5237623762376239e-06,
870
+ "loss": 96.9523,
871
+ "step": 610
872
+ },
873
+ {
874
+ "epoch": 0.5476097723857755,
875
+ "grad_norm": 149.0,
876
+ "learning_rate": 1.508910891089109e-06,
877
+ "loss": 96.1208,
878
+ "step": 615
879
+ },
880
+ {
881
+ "epoch": 0.5520618843563916,
882
+ "grad_norm": 151.0,
883
+ "learning_rate": 1.4940594059405941e-06,
884
+ "loss": 96.7012,
885
+ "step": 620
886
+ },
887
+ {
888
+ "epoch": 0.5565139963270076,
889
+ "grad_norm": 148.75,
890
+ "learning_rate": 1.4792079207920792e-06,
891
+ "loss": 95.9622,
892
+ "step": 625
893
+ },
894
+ {
895
+ "epoch": 0.5609661082976237,
896
+ "grad_norm": 148.0,
897
+ "learning_rate": 1.4643564356435644e-06,
898
+ "loss": 96.7668,
899
+ "step": 630
900
+ },
901
+ {
902
+ "epoch": 0.5654182202682397,
903
+ "grad_norm": 149.0,
904
+ "learning_rate": 1.4495049504950495e-06,
905
+ "loss": 95.4153,
906
+ "step": 635
907
+ },
908
+ {
909
+ "epoch": 0.5698703322388559,
910
+ "grad_norm": 149.375,
911
+ "learning_rate": 1.4346534653465346e-06,
912
+ "loss": 95.8311,
913
+ "step": 640
914
+ },
915
+ {
916
+ "epoch": 0.5743224442094719,
917
+ "grad_norm": 147.25,
918
+ "learning_rate": 1.4198019801980197e-06,
919
+ "loss": 97.1089,
920
+ "step": 645
921
+ },
922
+ {
923
+ "epoch": 0.5787745561800879,
924
+ "grad_norm": 149.75,
925
+ "learning_rate": 1.404950495049505e-06,
926
+ "loss": 96.5063,
927
+ "step": 650
928
+ },
929
+ {
930
+ "epoch": 0.583226668150704,
931
+ "grad_norm": 151.0,
932
+ "learning_rate": 1.3900990099009902e-06,
933
+ "loss": 97.1067,
934
+ "step": 655
935
+ },
936
+ {
937
+ "epoch": 0.58767878012132,
938
+ "grad_norm": 157.625,
939
+ "learning_rate": 1.3752475247524753e-06,
940
+ "loss": 96.2562,
941
+ "step": 660
942
+ },
943
+ {
944
+ "epoch": 0.5921308920919361,
945
+ "grad_norm": 152.375,
946
+ "learning_rate": 1.3603960396039604e-06,
947
+ "loss": 95.2903,
948
+ "step": 665
949
+ },
950
+ {
951
+ "epoch": 0.5965830040625522,
952
+ "grad_norm": 151.625,
953
+ "learning_rate": 1.3455445544554455e-06,
954
+ "loss": 95.4937,
955
+ "step": 670
956
+ },
957
+ {
958
+ "epoch": 0.6010351160331683,
959
+ "grad_norm": 151.625,
960
+ "learning_rate": 1.3306930693069308e-06,
961
+ "loss": 95.8772,
962
+ "step": 675
963
+ },
964
+ {
965
+ "epoch": 0.6054872280037843,
966
+ "grad_norm": 154.625,
967
+ "learning_rate": 1.315841584158416e-06,
968
+ "loss": 96.8501,
969
+ "step": 680
970
+ },
971
+ {
972
+ "epoch": 0.6099393399744003,
973
+ "grad_norm": 151.5,
974
+ "learning_rate": 1.300990099009901e-06,
975
+ "loss": 96.9895,
976
+ "step": 685
977
+ },
978
+ {
979
+ "epoch": 0.6143914519450164,
980
+ "grad_norm": 148.875,
981
+ "learning_rate": 1.2861386138613862e-06,
982
+ "loss": 94.545,
983
+ "step": 690
984
+ },
985
+ {
986
+ "epoch": 0.6188435639156324,
987
+ "grad_norm": 152.75,
988
+ "learning_rate": 1.2712871287128713e-06,
989
+ "loss": 95.3633,
990
+ "step": 695
991
+ },
992
+ {
993
+ "epoch": 0.6232956758862486,
994
+ "grad_norm": 153.875,
995
+ "learning_rate": 1.2564356435643564e-06,
996
+ "loss": 94.8764,
997
+ "step": 700
998
+ },
999
+ {
1000
+ "epoch": 0.6277477878568646,
1001
+ "grad_norm": 151.375,
1002
+ "learning_rate": 1.2415841584158415e-06,
1003
+ "loss": 94.5789,
1004
+ "step": 705
1005
+ },
1006
+ {
1007
+ "epoch": 0.6321998998274807,
1008
+ "grad_norm": 145.75,
1009
+ "learning_rate": 1.2267326732673267e-06,
1010
+ "loss": 94.5253,
1011
+ "step": 710
1012
+ },
1013
+ {
1014
+ "epoch": 0.6366520117980967,
1015
+ "grad_norm": 151.25,
1016
+ "learning_rate": 1.211881188118812e-06,
1017
+ "loss": 95.8074,
1018
+ "step": 715
1019
+ },
1020
+ {
1021
+ "epoch": 0.6411041237687127,
1022
+ "grad_norm": 152.875,
1023
+ "learning_rate": 1.197029702970297e-06,
1024
+ "loss": 94.3003,
1025
+ "step": 720
1026
+ },
1027
+ {
1028
+ "epoch": 0.6455562357393289,
1029
+ "grad_norm": 149.75,
1030
+ "learning_rate": 1.1821782178217822e-06,
1031
+ "loss": 93.6894,
1032
+ "step": 725
1033
+ },
1034
+ {
1035
+ "epoch": 0.6500083477099449,
1036
+ "grad_norm": 160.5,
1037
+ "learning_rate": 1.1673267326732673e-06,
1038
+ "loss": 93.642,
1039
+ "step": 730
1040
+ },
1041
+ {
1042
+ "epoch": 0.654460459680561,
1043
+ "grad_norm": 149.125,
1044
+ "learning_rate": 1.1524752475247524e-06,
1045
+ "loss": 95.5531,
1046
+ "step": 735
1047
+ },
1048
+ {
1049
+ "epoch": 0.658912571651177,
1050
+ "grad_norm": 155.625,
1051
+ "learning_rate": 1.1376237623762376e-06,
1052
+ "loss": 95.4458,
1053
+ "step": 740
1054
+ },
1055
+ {
1056
+ "epoch": 0.6633646836217931,
1057
+ "grad_norm": 146.5,
1058
+ "learning_rate": 1.1227722772277229e-06,
1059
+ "loss": 95.5642,
1060
+ "step": 745
1061
+ },
1062
+ {
1063
+ "epoch": 0.6678167955924091,
1064
+ "grad_norm": 150.875,
1065
+ "learning_rate": 1.107920792079208e-06,
1066
+ "loss": 91.5858,
1067
+ "step": 750
1068
+ },
1069
+ {
1070
+ "epoch": 0.6722689075630253,
1071
+ "grad_norm": 159.875,
1072
+ "learning_rate": 1.0930693069306931e-06,
1073
+ "loss": 95.2966,
1074
+ "step": 755
1075
+ },
1076
+ {
1077
+ "epoch": 0.6767210195336413,
1078
+ "grad_norm": 147.5,
1079
+ "learning_rate": 1.0782178217821782e-06,
1080
+ "loss": 93.9319,
1081
+ "step": 760
1082
+ },
1083
+ {
1084
+ "epoch": 0.6811731315042573,
1085
+ "grad_norm": 151.25,
1086
+ "learning_rate": 1.0633663366336634e-06,
1087
+ "loss": 96.4605,
1088
+ "step": 765
1089
+ },
1090
+ {
1091
+ "epoch": 0.6856252434748734,
1092
+ "grad_norm": 150.75,
1093
+ "learning_rate": 1.0485148514851485e-06,
1094
+ "loss": 95.4815,
1095
+ "step": 770
1096
+ },
1097
+ {
1098
+ "epoch": 0.6900773554454894,
1099
+ "grad_norm": 152.625,
1100
+ "learning_rate": 1.0336633663366336e-06,
1101
+ "loss": 95.7014,
1102
+ "step": 775
1103
+ },
1104
+ {
1105
+ "epoch": 0.6945294674161056,
1106
+ "grad_norm": 150.0,
1107
+ "learning_rate": 1.018811881188119e-06,
1108
+ "loss": 95.7446,
1109
+ "step": 780
1110
+ },
1111
+ {
1112
+ "epoch": 0.6989815793867216,
1113
+ "grad_norm": 149.75,
1114
+ "learning_rate": 1.003960396039604e-06,
1115
+ "loss": 91.6439,
1116
+ "step": 785
1117
+ },
1118
+ {
1119
+ "epoch": 0.7034336913573377,
1120
+ "grad_norm": 151.0,
1121
+ "learning_rate": 9.891089108910892e-07,
1122
+ "loss": 94.8783,
1123
+ "step": 790
1124
+ },
1125
+ {
1126
+ "epoch": 0.7078858033279537,
1127
+ "grad_norm": 147.875,
1128
+ "learning_rate": 9.742574257425743e-07,
1129
+ "loss": 94.2932,
1130
+ "step": 795
1131
+ },
1132
+ {
1133
+ "epoch": 0.7123379152985697,
1134
+ "grad_norm": 152.5,
1135
+ "learning_rate": 9.594059405940594e-07,
1136
+ "loss": 94.3391,
1137
+ "step": 800
1138
+ },
1139
+ {
1140
+ "epoch": 0.7167900272691858,
1141
+ "grad_norm": 151.625,
1142
+ "learning_rate": 9.445544554455446e-07,
1143
+ "loss": 93.1912,
1144
+ "step": 805
1145
+ },
1146
+ {
1147
+ "epoch": 0.7212421392398018,
1148
+ "grad_norm": 150.5,
1149
+ "learning_rate": 9.297029702970297e-07,
1150
+ "loss": 94.2237,
1151
+ "step": 810
1152
+ },
1153
+ {
1154
+ "epoch": 0.725694251210418,
1155
+ "grad_norm": 146.25,
1156
+ "learning_rate": 9.148514851485148e-07,
1157
+ "loss": 93.2389,
1158
+ "step": 815
1159
+ },
1160
+ {
1161
+ "epoch": 0.730146363181034,
1162
+ "grad_norm": 149.875,
1163
+ "learning_rate": 9e-07,
1164
+ "loss": 94.0122,
1165
+ "step": 820
1166
+ },
1167
+ {
1168
+ "epoch": 0.7345984751516501,
1169
+ "grad_norm": 148.375,
1170
+ "learning_rate": 8.851485148514851e-07,
1171
+ "loss": 94.1428,
1172
+ "step": 825
1173
+ },
1174
+ {
1175
+ "epoch": 0.7390505871222661,
1176
+ "grad_norm": 146.5,
1177
+ "learning_rate": 8.702970297029703e-07,
1178
+ "loss": 92.5999,
1179
+ "step": 830
1180
+ },
1181
+ {
1182
+ "epoch": 0.7435026990928821,
1183
+ "grad_norm": 158.375,
1184
+ "learning_rate": 8.554455445544554e-07,
1185
+ "loss": 93.4235,
1186
+ "step": 835
1187
+ },
1188
+ {
1189
+ "epoch": 0.7479548110634983,
1190
+ "grad_norm": 143.25,
1191
+ "learning_rate": 8.405940594059407e-07,
1192
+ "loss": 94.7491,
1193
+ "step": 840
1194
+ },
1195
+ {
1196
+ "epoch": 0.7524069230341143,
1197
+ "grad_norm": 149.375,
1198
+ "learning_rate": 8.257425742574259e-07,
1199
+ "loss": 93.3154,
1200
+ "step": 845
1201
+ },
1202
+ {
1203
+ "epoch": 0.7568590350047304,
1204
+ "grad_norm": 153.25,
1205
+ "learning_rate": 8.10891089108911e-07,
1206
+ "loss": 91.7949,
1207
+ "step": 850
1208
+ },
1209
+ {
1210
+ "epoch": 0.7613111469753464,
1211
+ "grad_norm": 151.375,
1212
+ "learning_rate": 7.960396039603961e-07,
1213
+ "loss": 91.7284,
1214
+ "step": 855
1215
+ },
1216
+ {
1217
+ "epoch": 0.7657632589459625,
1218
+ "grad_norm": 152.125,
1219
+ "learning_rate": 7.811881188118812e-07,
1220
+ "loss": 93.6502,
1221
+ "step": 860
1222
+ },
1223
+ {
1224
+ "epoch": 0.7702153709165785,
1225
+ "grad_norm": 151.125,
1226
+ "learning_rate": 7.663366336633663e-07,
1227
+ "loss": 94.1399,
1228
+ "step": 865
1229
+ },
1230
+ {
1231
+ "epoch": 0.7746674828871946,
1232
+ "grad_norm": 151.0,
1233
+ "learning_rate": 7.514851485148515e-07,
1234
+ "loss": 92.5499,
1235
+ "step": 870
1236
+ },
1237
+ {
1238
+ "epoch": 0.7791195948578107,
1239
+ "grad_norm": 150.875,
1240
+ "learning_rate": 7.366336633663367e-07,
1241
+ "loss": 93.4188,
1242
+ "step": 875
1243
+ },
1244
+ {
1245
+ "epoch": 0.7835717068284267,
1246
+ "grad_norm": 159.5,
1247
+ "learning_rate": 7.217821782178218e-07,
1248
+ "loss": 90.1312,
1249
+ "step": 880
1250
+ },
1251
+ {
1252
+ "epoch": 0.7880238187990428,
1253
+ "grad_norm": 144.25,
1254
+ "learning_rate": 7.069306930693069e-07,
1255
+ "loss": 91.4897,
1256
+ "step": 885
1257
+ },
1258
+ {
1259
+ "epoch": 0.7924759307696588,
1260
+ "grad_norm": 147.875,
1261
+ "learning_rate": 6.920792079207921e-07,
1262
+ "loss": 93.3765,
1263
+ "step": 890
1264
+ },
1265
+ {
1266
+ "epoch": 0.796928042740275,
1267
+ "grad_norm": 155.0,
1268
+ "learning_rate": 6.772277227722772e-07,
1269
+ "loss": 93.9429,
1270
+ "step": 895
1271
+ },
1272
+ {
1273
+ "epoch": 0.801380154710891,
1274
+ "grad_norm": 145.5,
1275
+ "learning_rate": 6.623762376237624e-07,
1276
+ "loss": 91.7795,
1277
+ "step": 900
1278
+ },
1279
+ {
1280
+ "epoch": 0.8058322666815071,
1281
+ "grad_norm": 148.375,
1282
+ "learning_rate": 6.475247524752476e-07,
1283
+ "loss": 92.5831,
1284
+ "step": 905
1285
+ },
1286
+ {
1287
+ "epoch": 0.8102843786521231,
1288
+ "grad_norm": 152.5,
1289
+ "learning_rate": 6.326732673267327e-07,
1290
+ "loss": 92.3641,
1291
+ "step": 910
1292
+ },
1293
+ {
1294
+ "epoch": 0.8147364906227391,
1295
+ "grad_norm": 148.75,
1296
+ "learning_rate": 6.178217821782178e-07,
1297
+ "loss": 91.3071,
1298
+ "step": 915
1299
+ },
1300
+ {
1301
+ "epoch": 0.8191886025933552,
1302
+ "grad_norm": 152.375,
1303
+ "learning_rate": 6.02970297029703e-07,
1304
+ "loss": 90.2066,
1305
+ "step": 920
1306
+ },
1307
+ {
1308
+ "epoch": 0.8236407145639713,
1309
+ "grad_norm": 148.75,
1310
+ "learning_rate": 5.881188118811882e-07,
1311
+ "loss": 91.9865,
1312
+ "step": 925
1313
+ },
1314
+ {
1315
+ "epoch": 0.8280928265345874,
1316
+ "grad_norm": 145.375,
1317
+ "learning_rate": 5.732673267326733e-07,
1318
+ "loss": 90.7357,
1319
+ "step": 930
1320
+ },
1321
+ {
1322
+ "epoch": 0.8325449385052034,
1323
+ "grad_norm": 153.5,
1324
+ "learning_rate": 5.584158415841584e-07,
1325
+ "loss": 94.0521,
1326
+ "step": 935
1327
+ },
1328
+ {
1329
+ "epoch": 0.8369970504758195,
1330
+ "grad_norm": 147.625,
1331
+ "learning_rate": 5.435643564356436e-07,
1332
+ "loss": 92.177,
1333
+ "step": 940
1334
+ },
1335
+ {
1336
+ "epoch": 0.8414491624464355,
1337
+ "grad_norm": 144.75,
1338
+ "learning_rate": 5.287128712871287e-07,
1339
+ "loss": 93.7151,
1340
+ "step": 945
1341
+ },
1342
+ {
1343
+ "epoch": 0.8459012744170515,
1344
+ "grad_norm": 145.375,
1345
+ "learning_rate": 5.138613861386139e-07,
1346
+ "loss": 90.6827,
1347
+ "step": 950
1348
+ },
1349
+ {
1350
+ "epoch": 0.8503533863876677,
1351
+ "grad_norm": 149.625,
1352
+ "learning_rate": 4.990099009900991e-07,
1353
+ "loss": 90.1044,
1354
+ "step": 955
1355
+ },
1356
+ {
1357
+ "epoch": 0.8548054983582837,
1358
+ "grad_norm": 147.5,
1359
+ "learning_rate": 4.841584158415842e-07,
1360
+ "loss": 90.5797,
1361
+ "step": 960
1362
+ },
1363
+ {
1364
+ "epoch": 0.8592576103288998,
1365
+ "grad_norm": 150.75,
1366
+ "learning_rate": 4.693069306930693e-07,
1367
+ "loss": 90.0814,
1368
+ "step": 965
1369
+ },
1370
+ {
1371
+ "epoch": 0.8637097222995158,
1372
+ "grad_norm": 152.5,
1373
+ "learning_rate": 4.5445544554455447e-07,
1374
+ "loss": 91.7237,
1375
+ "step": 970
1376
+ },
1377
+ {
1378
+ "epoch": 0.8681618342701319,
1379
+ "grad_norm": 152.625,
1380
+ "learning_rate": 4.396039603960396e-07,
1381
+ "loss": 91.0753,
1382
+ "step": 975
1383
+ },
1384
+ {
1385
+ "epoch": 0.872613946240748,
1386
+ "grad_norm": 152.0,
1387
+ "learning_rate": 4.247524752475247e-07,
1388
+ "loss": 91.9936,
1389
+ "step": 980
1390
+ },
1391
+ {
1392
+ "epoch": 0.877066058211364,
1393
+ "grad_norm": 143.5,
1394
+ "learning_rate": 4.099009900990099e-07,
1395
+ "loss": 91.7998,
1396
+ "step": 985
1397
+ },
1398
+ {
1399
+ "epoch": 0.8815181701819801,
1400
+ "grad_norm": 146.25,
1401
+ "learning_rate": 3.950495049504951e-07,
1402
+ "loss": 91.8213,
1403
+ "step": 990
1404
+ },
1405
+ {
1406
+ "epoch": 0.8859702821525961,
1407
+ "grad_norm": 153.375,
1408
+ "learning_rate": 3.801980198019802e-07,
1409
+ "loss": 90.3063,
1410
+ "step": 995
1411
+ },
1412
+ {
1413
+ "epoch": 0.8904223941232122,
1414
+ "grad_norm": 145.25,
1415
+ "learning_rate": 3.653465346534653e-07,
1416
+ "loss": 91.2036,
1417
+ "step": 1000
1418
+ },
1419
+ {
1420
+ "epoch": 0.8904223941232122,
1421
+ "eval_loss": NaN,
1422
+ "eval_runtime": 101.482,
1423
+ "eval_samples_per_second": 596.431,
1424
+ "eval_steps_per_second": 37.278,
1425
+ "step": 1000
1426
  }
1427
  ],
1428
  "logging_steps": 5,
 
1442
  "attributes": {}
1443
  }
1444
  },
1445
+ "total_flos": 4.332357992788787e+18,
1446
  "train_batch_size": 4,
1447
  "trial_name": null,
1448
  "trial_params": null