CocoRoF commited on
Commit
6962b32
·
verified ·
1 Parent(s): b8bcf4b

Training in progress, step 1000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37082c4336aec2b59f43b05c2e88b43b8538ca424daa6eedc6de2c64eb10399e
3
  size 791869518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:617ff76d496774425760760c864d36a829690e93c00f300e5fd6a772bec23af2
3
  size 791869518
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eed6f19e0e5fba4e01e5db6f3883fb75284489bbd2f9466b81466ca19049d256
3
  size 2375752250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdd691842c9b3eecd0100adcd2271f6b8b162e162a40b960523591fe8491a784
3
  size 2375752250
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:78d3f197f6c6558fa8056324f1563ab9e957255f5a1a959362aa4eed7a9545db
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74386f26f36ed67f56395205881e5db2d0c28ffcbeed50dd95b28771d2dac588
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c1a9c65c2869356282cad6b4a0f7dff7f4dd68ab3d9d216c72b7d6cb524f860
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41c88f9de084200454883a13c3717941ea3fd433e2f8735507fc30611f9c5501
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:896febe768e17bae5022a95960c041f6425783774ec8859d99d3b149063b1bf9
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:965b00d4cb4710ebab57c8787b9925bb3f77b8eeba94a186ec4bc1c2f326ef3f
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eac482d57e966585467c8ef44dae2869bf7e5d92886f69c11ed7bccc34c07efe
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5dc374b8b9a4c45c950f9d136feab85a767081fa59f0c7d68ed3a62060c4949
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e1f27d227a20dc320ac283e0938fb2f6e5b475829a583f8c44d1a16a8c828307
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c7c212fb779217f1edac0baf44f67b608eefc1e0e4e3f5a9dd7eb557032c1bc
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d05a7106aaeaec4b81704e3f4a998b5123cf9342a6733bd9fd2d578e99108c3b
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86e1effd626ce1e95dd68a0c8089fe19218f2b24dfe9e45ed2cab1c0ebc10ba1
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b94120d8d88502ec8d8b623ec7550315caca003b44fcffbb5767ab0de91baefe
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:799cc83f60dfc1c4243cfd6403592112414a2eec494e6832f10221c96ff62c20
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:332e4d901be380f740b5d8578f7b80ef1865c7fba83bc288c8a35852205cc668
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:586777c398770c3255d3a1f48c7fef44ea9d89117c627c9ea490e16bfd9a49ba
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec07cf9c89b28b68684960473502733dee8ba44197f08f08f50d0f2bd0d4d16a
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:079541ad505a3dc0e80239afe57cfe11e44b1de1d72b78fa14ede3d018356e24
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.11558772747303556,
5
  "eval_steps": 500,
6
- "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -715,6 +715,714 @@
715
  "eval_samples_per_second": 606.217,
716
  "eval_steps_per_second": 37.889,
717
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
718
  }
719
  ],
720
  "logging_steps": 5,
@@ -734,7 +1442,7 @@
734
  "attributes": {}
735
  }
736
  },
737
- "total_flos": 2.1661789963943936e+18,
738
  "train_batch_size": 4,
739
  "trial_name": null,
740
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.2311754549460711,
5
  "eval_steps": 500,
6
+ "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
715
  "eval_samples_per_second": 606.217,
716
  "eval_steps_per_second": 37.889,
717
  "step": 500
718
+ },
719
+ {
720
+ "epoch": 0.1167436047477659,
721
+ "grad_norm": 178.25,
722
+ "learning_rate": 9.815005138746147e-06,
723
+ "loss": 73.0971,
724
+ "step": 505
725
+ },
726
+ {
727
+ "epoch": 0.11789948202249625,
728
+ "grad_norm": 176.125,
729
+ "learning_rate": 9.802158273381296e-06,
730
+ "loss": 73.0252,
731
+ "step": 510
732
+ },
733
+ {
734
+ "epoch": 0.11905535929722662,
735
+ "grad_norm": 204.625,
736
+ "learning_rate": 9.789311408016445e-06,
737
+ "loss": 74.122,
738
+ "step": 515
739
+ },
740
+ {
741
+ "epoch": 0.12021123657195697,
742
+ "grad_norm": 175.25,
743
+ "learning_rate": 9.776464542651593e-06,
744
+ "loss": 75.304,
745
+ "step": 520
746
+ },
747
+ {
748
+ "epoch": 0.12136711384668733,
749
+ "grad_norm": 228.125,
750
+ "learning_rate": 9.763617677286742e-06,
751
+ "loss": 80.1305,
752
+ "step": 525
753
+ },
754
+ {
755
+ "epoch": 0.12252299112141768,
756
+ "grad_norm": 351.25,
757
+ "learning_rate": 9.750770811921891e-06,
758
+ "loss": 98.8667,
759
+ "step": 530
760
+ },
761
+ {
762
+ "epoch": 0.12367886839614804,
763
+ "grad_norm": 253.75,
764
+ "learning_rate": 9.73792394655704e-06,
765
+ "loss": 90.5137,
766
+ "step": 535
767
+ },
768
+ {
769
+ "epoch": 0.12483474567087839,
770
+ "grad_norm": 204.0,
771
+ "learning_rate": 9.72507708119219e-06,
772
+ "loss": 85.6656,
773
+ "step": 540
774
+ },
775
+ {
776
+ "epoch": 0.12599062294560875,
777
+ "grad_norm": 164.875,
778
+ "learning_rate": 9.712230215827338e-06,
779
+ "loss": 81.4311,
780
+ "step": 545
781
+ },
782
+ {
783
+ "epoch": 0.1271465002203391,
784
+ "grad_norm": 171.75,
785
+ "learning_rate": 9.699383350462488e-06,
786
+ "loss": 78.962,
787
+ "step": 550
788
+ },
789
+ {
790
+ "epoch": 0.12830237749506945,
791
+ "grad_norm": 181.375,
792
+ "learning_rate": 9.686536485097637e-06,
793
+ "loss": 78.2494,
794
+ "step": 555
795
+ },
796
+ {
797
+ "epoch": 0.12945825476979983,
798
+ "grad_norm": 167.125,
799
+ "learning_rate": 9.673689619732786e-06,
800
+ "loss": 76.5218,
801
+ "step": 560
802
+ },
803
+ {
804
+ "epoch": 0.13061413204453018,
805
+ "grad_norm": 174.625,
806
+ "learning_rate": 9.660842754367935e-06,
807
+ "loss": 76.3271,
808
+ "step": 565
809
+ },
810
+ {
811
+ "epoch": 0.13177000931926053,
812
+ "grad_norm": 163.125,
813
+ "learning_rate": 9.647995889003084e-06,
814
+ "loss": 77.2531,
815
+ "step": 570
816
+ },
817
+ {
818
+ "epoch": 0.13292588659399088,
819
+ "grad_norm": 185.875,
820
+ "learning_rate": 9.635149023638232e-06,
821
+ "loss": 76.3901,
822
+ "step": 575
823
+ },
824
+ {
825
+ "epoch": 0.13408176386872123,
826
+ "grad_norm": 160.5,
827
+ "learning_rate": 9.622302158273383e-06,
828
+ "loss": 76.1426,
829
+ "step": 580
830
+ },
831
+ {
832
+ "epoch": 0.1352376411434516,
833
+ "grad_norm": 170.75,
834
+ "learning_rate": 9.609455292908532e-06,
835
+ "loss": 75.3614,
836
+ "step": 585
837
+ },
838
+ {
839
+ "epoch": 0.13639351841818195,
840
+ "grad_norm": 186.125,
841
+ "learning_rate": 9.59660842754368e-06,
842
+ "loss": 75.3318,
843
+ "step": 590
844
+ },
845
+ {
846
+ "epoch": 0.1375493956929123,
847
+ "grad_norm": 176.5,
848
+ "learning_rate": 9.58376156217883e-06,
849
+ "loss": 73.9041,
850
+ "step": 595
851
+ },
852
+ {
853
+ "epoch": 0.13870527296764265,
854
+ "grad_norm": 170.25,
855
+ "learning_rate": 9.570914696813978e-06,
856
+ "loss": 74.9994,
857
+ "step": 600
858
+ },
859
+ {
860
+ "epoch": 0.13986115024237303,
861
+ "grad_norm": 222.625,
862
+ "learning_rate": 9.558067831449127e-06,
863
+ "loss": 76.4041,
864
+ "step": 605
865
+ },
866
+ {
867
+ "epoch": 0.14101702751710338,
868
+ "grad_norm": 174.625,
869
+ "learning_rate": 9.545220966084276e-06,
870
+ "loss": 76.0331,
871
+ "step": 610
872
+ },
873
+ {
874
+ "epoch": 0.14217290479183373,
875
+ "grad_norm": 169.875,
876
+ "learning_rate": 9.532374100719425e-06,
877
+ "loss": 74.6435,
878
+ "step": 615
879
+ },
880
+ {
881
+ "epoch": 0.14332878206656408,
882
+ "grad_norm": 188.75,
883
+ "learning_rate": 9.519527235354574e-06,
884
+ "loss": 76.2372,
885
+ "step": 620
886
+ },
887
+ {
888
+ "epoch": 0.14448465934129442,
889
+ "grad_norm": 164.25,
890
+ "learning_rate": 9.506680369989724e-06,
891
+ "loss": 75.1062,
892
+ "step": 625
893
+ },
894
+ {
895
+ "epoch": 0.1456405366160248,
896
+ "grad_norm": 165.125,
897
+ "learning_rate": 9.493833504624871e-06,
898
+ "loss": 75.9294,
899
+ "step": 630
900
+ },
901
+ {
902
+ "epoch": 0.14679641389075515,
903
+ "grad_norm": 189.75,
904
+ "learning_rate": 9.480986639260022e-06,
905
+ "loss": 75.5001,
906
+ "step": 635
907
+ },
908
+ {
909
+ "epoch": 0.1479522911654855,
910
+ "grad_norm": 228.5,
911
+ "learning_rate": 9.468139773895171e-06,
912
+ "loss": 74.1863,
913
+ "step": 640
914
+ },
915
+ {
916
+ "epoch": 0.14910816844021585,
917
+ "grad_norm": 222.0,
918
+ "learning_rate": 9.45529290853032e-06,
919
+ "loss": 76.5168,
920
+ "step": 645
921
+ },
922
+ {
923
+ "epoch": 0.15026404571494623,
924
+ "grad_norm": 187.875,
925
+ "learning_rate": 9.442446043165469e-06,
926
+ "loss": 74.2555,
927
+ "step": 650
928
+ },
929
+ {
930
+ "epoch": 0.15141992298967658,
931
+ "grad_norm": 159.0,
932
+ "learning_rate": 9.429599177800617e-06,
933
+ "loss": 75.0447,
934
+ "step": 655
935
+ },
936
+ {
937
+ "epoch": 0.15257580026440692,
938
+ "grad_norm": 158.75,
939
+ "learning_rate": 9.416752312435766e-06,
940
+ "loss": 73.3359,
941
+ "step": 660
942
+ },
943
+ {
944
+ "epoch": 0.15373167753913727,
945
+ "grad_norm": 170.625,
946
+ "learning_rate": 9.403905447070915e-06,
947
+ "loss": 73.3929,
948
+ "step": 665
949
+ },
950
+ {
951
+ "epoch": 0.15488755481386765,
952
+ "grad_norm": 202.5,
953
+ "learning_rate": 9.391058581706064e-06,
954
+ "loss": 75.0084,
955
+ "step": 670
956
+ },
957
+ {
958
+ "epoch": 0.156043432088598,
959
+ "grad_norm": 194.625,
960
+ "learning_rate": 9.378211716341213e-06,
961
+ "loss": 74.5239,
962
+ "step": 675
963
+ },
964
+ {
965
+ "epoch": 0.15719930936332835,
966
+ "grad_norm": 178.375,
967
+ "learning_rate": 9.365364850976363e-06,
968
+ "loss": 75.5832,
969
+ "step": 680
970
+ },
971
+ {
972
+ "epoch": 0.1583551866380587,
973
+ "grad_norm": 162.75,
974
+ "learning_rate": 9.35251798561151e-06,
975
+ "loss": 74.5449,
976
+ "step": 685
977
+ },
978
+ {
979
+ "epoch": 0.15951106391278905,
980
+ "grad_norm": 184.75,
981
+ "learning_rate": 9.339671120246661e-06,
982
+ "loss": 74.4716,
983
+ "step": 690
984
+ },
985
+ {
986
+ "epoch": 0.16066694118751942,
987
+ "grad_norm": 175.0,
988
+ "learning_rate": 9.32682425488181e-06,
989
+ "loss": 75.2684,
990
+ "step": 695
991
+ },
992
+ {
993
+ "epoch": 0.16182281846224977,
994
+ "grad_norm": 173.75,
995
+ "learning_rate": 9.313977389516959e-06,
996
+ "loss": 73.7116,
997
+ "step": 700
998
+ },
999
+ {
1000
+ "epoch": 0.16297869573698012,
1001
+ "grad_norm": 165.375,
1002
+ "learning_rate": 9.301130524152108e-06,
1003
+ "loss": 73.4956,
1004
+ "step": 705
1005
+ },
1006
+ {
1007
+ "epoch": 0.16413457301171047,
1008
+ "grad_norm": 175.875,
1009
+ "learning_rate": 9.288283658787256e-06,
1010
+ "loss": 73.5028,
1011
+ "step": 710
1012
+ },
1013
+ {
1014
+ "epoch": 0.16529045028644085,
1015
+ "grad_norm": 171.375,
1016
+ "learning_rate": 9.275436793422405e-06,
1017
+ "loss": 73.6011,
1018
+ "step": 715
1019
+ },
1020
+ {
1021
+ "epoch": 0.1664463275611712,
1022
+ "grad_norm": 175.75,
1023
+ "learning_rate": 9.262589928057554e-06,
1024
+ "loss": 74.965,
1025
+ "step": 720
1026
+ },
1027
+ {
1028
+ "epoch": 0.16760220483590155,
1029
+ "grad_norm": 160.625,
1030
+ "learning_rate": 9.249743062692705e-06,
1031
+ "loss": 74.7802,
1032
+ "step": 725
1033
+ },
1034
+ {
1035
+ "epoch": 0.1687580821106319,
1036
+ "grad_norm": 163.875,
1037
+ "learning_rate": 9.236896197327852e-06,
1038
+ "loss": 74.4619,
1039
+ "step": 730
1040
+ },
1041
+ {
1042
+ "epoch": 0.16991395938536225,
1043
+ "grad_norm": 173.0,
1044
+ "learning_rate": 9.224049331963002e-06,
1045
+ "loss": 74.9448,
1046
+ "step": 735
1047
+ },
1048
+ {
1049
+ "epoch": 0.17106983666009262,
1050
+ "grad_norm": 193.375,
1051
+ "learning_rate": 9.21120246659815e-06,
1052
+ "loss": 74.6345,
1053
+ "step": 740
1054
+ },
1055
+ {
1056
+ "epoch": 0.17222571393482297,
1057
+ "grad_norm": 168.25,
1058
+ "learning_rate": 9.1983556012333e-06,
1059
+ "loss": 74.5997,
1060
+ "step": 745
1061
+ },
1062
+ {
1063
+ "epoch": 0.17338159120955332,
1064
+ "grad_norm": 181.375,
1065
+ "learning_rate": 9.185508735868449e-06,
1066
+ "loss": 72.9167,
1067
+ "step": 750
1068
+ },
1069
+ {
1070
+ "epoch": 0.17453746848428367,
1071
+ "grad_norm": 209.875,
1072
+ "learning_rate": 9.172661870503598e-06,
1073
+ "loss": 75.2782,
1074
+ "step": 755
1075
+ },
1076
+ {
1077
+ "epoch": 0.17569334575901405,
1078
+ "grad_norm": 165.25,
1079
+ "learning_rate": 9.159815005138747e-06,
1080
+ "loss": 73.2717,
1081
+ "step": 760
1082
+ },
1083
+ {
1084
+ "epoch": 0.1768492230337444,
1085
+ "grad_norm": 168.75,
1086
+ "learning_rate": 9.146968139773897e-06,
1087
+ "loss": 75.1429,
1088
+ "step": 765
1089
+ },
1090
+ {
1091
+ "epoch": 0.17800510030847475,
1092
+ "grad_norm": 161.0,
1093
+ "learning_rate": 9.134121274409044e-06,
1094
+ "loss": 74.8651,
1095
+ "step": 770
1096
+ },
1097
+ {
1098
+ "epoch": 0.1791609775832051,
1099
+ "grad_norm": 186.375,
1100
+ "learning_rate": 9.121274409044195e-06,
1101
+ "loss": 75.679,
1102
+ "step": 775
1103
+ },
1104
+ {
1105
+ "epoch": 0.18031685485793547,
1106
+ "grad_norm": 183.875,
1107
+ "learning_rate": 9.108427543679344e-06,
1108
+ "loss": 74.1174,
1109
+ "step": 780
1110
+ },
1111
+ {
1112
+ "epoch": 0.18147273213266582,
1113
+ "grad_norm": 160.0,
1114
+ "learning_rate": 9.095580678314493e-06,
1115
+ "loss": 74.8214,
1116
+ "step": 785
1117
+ },
1118
+ {
1119
+ "epoch": 0.18262860940739617,
1120
+ "grad_norm": 173.625,
1121
+ "learning_rate": 9.082733812949641e-06,
1122
+ "loss": 72.8824,
1123
+ "step": 790
1124
+ },
1125
+ {
1126
+ "epoch": 0.18378448668212652,
1127
+ "grad_norm": 184.75,
1128
+ "learning_rate": 9.06988694758479e-06,
1129
+ "loss": 75.2577,
1130
+ "step": 795
1131
+ },
1132
+ {
1133
+ "epoch": 0.18494036395685687,
1134
+ "grad_norm": 171.75,
1135
+ "learning_rate": 9.057040082219939e-06,
1136
+ "loss": 74.0234,
1137
+ "step": 800
1138
+ },
1139
+ {
1140
+ "epoch": 0.18609624123158724,
1141
+ "grad_norm": 166.125,
1142
+ "learning_rate": 9.044193216855088e-06,
1143
+ "loss": 74.5028,
1144
+ "step": 805
1145
+ },
1146
+ {
1147
+ "epoch": 0.1872521185063176,
1148
+ "grad_norm": 181.25,
1149
+ "learning_rate": 9.031346351490237e-06,
1150
+ "loss": 74.7906,
1151
+ "step": 810
1152
+ },
1153
+ {
1154
+ "epoch": 0.18840799578104794,
1155
+ "grad_norm": 168.25,
1156
+ "learning_rate": 9.018499486125386e-06,
1157
+ "loss": 73.6598,
1158
+ "step": 815
1159
+ },
1160
+ {
1161
+ "epoch": 0.1895638730557783,
1162
+ "grad_norm": 213.625,
1163
+ "learning_rate": 9.005652620760536e-06,
1164
+ "loss": 74.4191,
1165
+ "step": 820
1166
+ },
1167
+ {
1168
+ "epoch": 0.19071975033050867,
1169
+ "grad_norm": 184.5,
1170
+ "learning_rate": 8.992805755395683e-06,
1171
+ "loss": 74.0181,
1172
+ "step": 825
1173
+ },
1174
+ {
1175
+ "epoch": 0.19187562760523902,
1176
+ "grad_norm": 168.75,
1177
+ "learning_rate": 8.979958890030834e-06,
1178
+ "loss": 72.935,
1179
+ "step": 830
1180
+ },
1181
+ {
1182
+ "epoch": 0.19303150487996937,
1183
+ "grad_norm": 176.625,
1184
+ "learning_rate": 8.967112024665983e-06,
1185
+ "loss": 74.5831,
1186
+ "step": 835
1187
+ },
1188
+ {
1189
+ "epoch": 0.19418738215469972,
1190
+ "grad_norm": 168.625,
1191
+ "learning_rate": 8.954265159301132e-06,
1192
+ "loss": 75.029,
1193
+ "step": 840
1194
+ },
1195
+ {
1196
+ "epoch": 0.1953432594294301,
1197
+ "grad_norm": 188.0,
1198
+ "learning_rate": 8.94141829393628e-06,
1199
+ "loss": 75.1383,
1200
+ "step": 845
1201
+ },
1202
+ {
1203
+ "epoch": 0.19649913670416044,
1204
+ "grad_norm": 163.75,
1205
+ "learning_rate": 8.92857142857143e-06,
1206
+ "loss": 73.7028,
1207
+ "step": 850
1208
+ },
1209
+ {
1210
+ "epoch": 0.1976550139788908,
1211
+ "grad_norm": 168.75,
1212
+ "learning_rate": 8.915724563206578e-06,
1213
+ "loss": 73.8515,
1214
+ "step": 855
1215
+ },
1216
+ {
1217
+ "epoch": 0.19881089125362114,
1218
+ "grad_norm": 199.875,
1219
+ "learning_rate": 8.902877697841727e-06,
1220
+ "loss": 74.1701,
1221
+ "step": 860
1222
+ },
1223
+ {
1224
+ "epoch": 0.1999667685283515,
1225
+ "grad_norm": 157.5,
1226
+ "learning_rate": 8.890030832476876e-06,
1227
+ "loss": 75.171,
1228
+ "step": 865
1229
+ },
1230
+ {
1231
+ "epoch": 0.20112264580308187,
1232
+ "grad_norm": 189.875,
1233
+ "learning_rate": 8.877183967112025e-06,
1234
+ "loss": 73.9979,
1235
+ "step": 870
1236
+ },
1237
+ {
1238
+ "epoch": 0.20227852307781222,
1239
+ "grad_norm": 160.75,
1240
+ "learning_rate": 8.864337101747175e-06,
1241
+ "loss": 73.9971,
1242
+ "step": 875
1243
+ },
1244
+ {
1245
+ "epoch": 0.20343440035254257,
1246
+ "grad_norm": 170.75,
1247
+ "learning_rate": 8.851490236382322e-06,
1248
+ "loss": 73.9474,
1249
+ "step": 880
1250
+ },
1251
+ {
1252
+ "epoch": 0.20459027762727292,
1253
+ "grad_norm": 161.875,
1254
+ "learning_rate": 8.838643371017473e-06,
1255
+ "loss": 73.6239,
1256
+ "step": 885
1257
+ },
1258
+ {
1259
+ "epoch": 0.2057461549020033,
1260
+ "grad_norm": 208.25,
1261
+ "learning_rate": 8.825796505652622e-06,
1262
+ "loss": 74.3579,
1263
+ "step": 890
1264
+ },
1265
+ {
1266
+ "epoch": 0.20690203217673364,
1267
+ "grad_norm": 182.375,
1268
+ "learning_rate": 8.81294964028777e-06,
1269
+ "loss": 73.6577,
1270
+ "step": 895
1271
+ },
1272
+ {
1273
+ "epoch": 0.208057909451464,
1274
+ "grad_norm": 167.375,
1275
+ "learning_rate": 8.80010277492292e-06,
1276
+ "loss": 73.492,
1277
+ "step": 900
1278
+ },
1279
+ {
1280
+ "epoch": 0.20921378672619434,
1281
+ "grad_norm": 202.625,
1282
+ "learning_rate": 8.787255909558068e-06,
1283
+ "loss": 73.0436,
1284
+ "step": 905
1285
+ },
1286
+ {
1287
+ "epoch": 0.2103696640009247,
1288
+ "grad_norm": 156.5,
1289
+ "learning_rate": 8.774409044193217e-06,
1290
+ "loss": 74.3569,
1291
+ "step": 910
1292
+ },
1293
+ {
1294
+ "epoch": 0.21152554127565507,
1295
+ "grad_norm": 167.75,
1296
+ "learning_rate": 8.761562178828366e-06,
1297
+ "loss": 73.5566,
1298
+ "step": 915
1299
+ },
1300
+ {
1301
+ "epoch": 0.21268141855038541,
1302
+ "grad_norm": 159.0,
1303
+ "learning_rate": 8.748715313463516e-06,
1304
+ "loss": 73.4377,
1305
+ "step": 920
1306
+ },
1307
+ {
1308
+ "epoch": 0.21383729582511576,
1309
+ "grad_norm": 197.5,
1310
+ "learning_rate": 8.735868448098664e-06,
1311
+ "loss": 73.5647,
1312
+ "step": 925
1313
+ },
1314
+ {
1315
+ "epoch": 0.2149931730998461,
1316
+ "grad_norm": 157.75,
1317
+ "learning_rate": 8.723021582733814e-06,
1318
+ "loss": 74.3935,
1319
+ "step": 930
1320
+ },
1321
+ {
1322
+ "epoch": 0.2161490503745765,
1323
+ "grad_norm": 160.0,
1324
+ "learning_rate": 8.710174717368961e-06,
1325
+ "loss": 73.5235,
1326
+ "step": 935
1327
+ },
1328
+ {
1329
+ "epoch": 0.21730492764930684,
1330
+ "grad_norm": 193.5,
1331
+ "learning_rate": 8.697327852004112e-06,
1332
+ "loss": 74.8152,
1333
+ "step": 940
1334
+ },
1335
+ {
1336
+ "epoch": 0.2184608049240372,
1337
+ "grad_norm": 166.125,
1338
+ "learning_rate": 8.68448098663926e-06,
1339
+ "loss": 74.962,
1340
+ "step": 945
1341
+ },
1342
+ {
1343
+ "epoch": 0.21961668219876754,
1344
+ "grad_norm": 187.75,
1345
+ "learning_rate": 8.67163412127441e-06,
1346
+ "loss": 73.3269,
1347
+ "step": 950
1348
+ },
1349
+ {
1350
+ "epoch": 0.22077255947349791,
1351
+ "grad_norm": 177.625,
1352
+ "learning_rate": 8.658787255909558e-06,
1353
+ "loss": 72.6171,
1354
+ "step": 955
1355
+ },
1356
+ {
1357
+ "epoch": 0.22192843674822826,
1358
+ "grad_norm": 207.875,
1359
+ "learning_rate": 8.645940390544709e-06,
1360
+ "loss": 73.4562,
1361
+ "step": 960
1362
+ },
1363
+ {
1364
+ "epoch": 0.2230843140229586,
1365
+ "grad_norm": 197.0,
1366
+ "learning_rate": 8.633093525179856e-06,
1367
+ "loss": 72.9479,
1368
+ "step": 965
1369
+ },
1370
+ {
1371
+ "epoch": 0.22424019129768896,
1372
+ "grad_norm": 178.75,
1373
+ "learning_rate": 8.620246659815007e-06,
1374
+ "loss": 73.2069,
1375
+ "step": 970
1376
+ },
1377
+ {
1378
+ "epoch": 0.2253960685724193,
1379
+ "grad_norm": 184.375,
1380
+ "learning_rate": 8.607399794450156e-06,
1381
+ "loss": 72.4369,
1382
+ "step": 975
1383
+ },
1384
+ {
1385
+ "epoch": 0.2265519458471497,
1386
+ "grad_norm": 193.625,
1387
+ "learning_rate": 8.594552929085304e-06,
1388
+ "loss": 75.2301,
1389
+ "step": 980
1390
+ },
1391
+ {
1392
+ "epoch": 0.22770782312188004,
1393
+ "grad_norm": 160.75,
1394
+ "learning_rate": 8.581706063720453e-06,
1395
+ "loss": 74.4445,
1396
+ "step": 985
1397
+ },
1398
+ {
1399
+ "epoch": 0.2288637003966104,
1400
+ "grad_norm": 188.0,
1401
+ "learning_rate": 8.568859198355602e-06,
1402
+ "loss": 73.1566,
1403
+ "step": 990
1404
+ },
1405
+ {
1406
+ "epoch": 0.23001957767134074,
1407
+ "grad_norm": 166.0,
1408
+ "learning_rate": 8.556012332990751e-06,
1409
+ "loss": 72.9151,
1410
+ "step": 995
1411
+ },
1412
+ {
1413
+ "epoch": 0.2311754549460711,
1414
+ "grad_norm": 165.5,
1415
+ "learning_rate": 8.5431654676259e-06,
1416
+ "loss": 72.4977,
1417
+ "step": 1000
1418
+ },
1419
+ {
1420
+ "epoch": 0.2311754549460711,
1421
+ "eval_loss": NaN,
1422
+ "eval_runtime": 382.2843,
1423
+ "eval_samples_per_second": 609.845,
1424
+ "eval_steps_per_second": 38.116,
1425
+ "step": 1000
1426
  }
1427
  ],
1428
  "logging_steps": 5,
 
1442
  "attributes": {}
1443
  }
1444
  },
1445
+ "total_flos": 4.332357992788787e+18,
1446
  "train_batch_size": 4,
1447
  "trial_name": null,
1448
  "trial_params": null