CocoRoF commited on
Commit
f64d5ab
·
verified ·
1 Parent(s): ccd7594

Training in progress, step 2000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bb537bc58a461228b496e058380dfff3de3db1fc3f2945771e96c8310f621661
3
  size 737580392
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f5ad69af16ab5281b26e97b01c5792b964c744a508c6bc172eddc193844ce26
3
  size 737580392
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4202f05dcca1fa833f65010e10965eabf5ab866b8682de76c6f36e41f9427aba
3
  size 1475248442
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e599dee06fa72aa210fe2735308cf272ff913daca158dd4092f84df1fcce8fb7
3
  size 1475248442
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69ec6e3926fa071bede113523efa3dc6e630c3c7958c54a9ca321cf4d62ed145
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e5cd4ad571350abe2eb98424dd5c5dd650f79de5be8de2b9ff4da9d030d723b
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6127ee4f0c13500ec5038fce65af8f7beec63c137c7d4b7c157aa6303cf5879
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcff98006be86afc3f75b37d6113fdf5b62db51c94b6f68b33f555f4ac346822
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:da01d1c5eb2cc3a323f97c1f590d13ccfac2a4c5b1479bd378b4e643304f5a4f
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f220fd74a6757e167d014f721e96b7e5710e8f5c97f48c9fe6d72e19ebbbd65c
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49a3f04d76c0d3acc7d3dd95a04215f368f35a451ae8cba8a2fdba38cda9ca0a
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21565575b5db0aa139865ffb0d9df6ceb55078dc7b218f601419cc3d7b873134
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df7d2c9825dba80cb544920f8cc0c72122f96514e6cd259052a8765b034393e2
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:487a03a3b6c36091572b8fbb74add1eb3c753efe5ab0eee791c8d03f495e5c98
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a20a42d44ff48cc162224010190e898fe28598ddad8cd1896d330a3bb1d8ec3
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98f8c6e22cfd0b3668705becc42fb2c443ef5e4cfe38d4ba5e3dfdc565094143
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:18ac0dc4f09f25179860561fcea7c5c8f997aabdc46a170665f9dc5a72bc27c6
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:379eebc7ccebea3c24281c6604242d09589a64d4774ea37b6d5cf6e7bbece645
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a16fcb5411ff961b47eff7378d85105fe9837e0492d19ea5ce3b7c4b77aa3b6
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3668b553f323a1aa5806c5d8feff7c926f6116dc2b7f961e9746634c8e825c0
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc979d8a4a308942c202386cab3e9f600572bc65e2a425e5891132147d087023
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93895b5baeef3dfbe9ac5ba0209fcc94427355308cca9131e3f76f15e4750806
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.8753514526710404,
5
  "eval_steps": 250,
6
- "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -771,6 +771,770 @@
771
  "eval_spearman_manhattan": 0.7621231744319878,
772
  "eval_steps_per_second": 6.85,
773
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
774
  }
775
  ],
776
  "logging_steps": 10,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.751640112464855,
5
  "eval_steps": 250,
6
+ "global_step": 2000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
771
  "eval_spearman_manhattan": 0.7621231744319878,
772
  "eval_steps_per_second": 6.85,
773
  "step": 1000
774
+ },
775
+ {
776
+ "epoch": 1.8940955951265228,
777
+ "grad_norm": 0.3324965834617615,
778
+ "learning_rate": 0.000799408396667106,
779
+ "loss": 0.1491,
780
+ "step": 1010
781
+ },
782
+ {
783
+ "epoch": 1.9128397375820057,
784
+ "grad_norm": 0.3112243711948395,
785
+ "learning_rate": 0.0007994025392083644,
786
+ "loss": 0.1622,
787
+ "step": 1020
788
+ },
789
+ {
790
+ "epoch": 1.9315838800374883,
791
+ "grad_norm": 0.3381972014904022,
792
+ "learning_rate": 0.000799396681749623,
793
+ "loss": 0.1462,
794
+ "step": 1030
795
+ },
796
+ {
797
+ "epoch": 1.9503280224929709,
798
+ "grad_norm": 0.3424859642982483,
799
+ "learning_rate": 0.0007993908242908814,
800
+ "loss": 0.1651,
801
+ "step": 1040
802
+ },
803
+ {
804
+ "epoch": 1.9690721649484537,
805
+ "grad_norm": 0.42161494493484497,
806
+ "learning_rate": 0.0007993849668321399,
807
+ "loss": 0.1521,
808
+ "step": 1050
809
+ },
810
+ {
811
+ "epoch": 1.9878163074039361,
812
+ "grad_norm": 0.3541307747364044,
813
+ "learning_rate": 0.0007993791093733984,
814
+ "loss": 0.162,
815
+ "step": 1060
816
+ },
817
+ {
818
+ "epoch": 2.007497656982193,
819
+ "grad_norm": 0.22963856160640717,
820
+ "learning_rate": 0.0007993732519146568,
821
+ "loss": 0.1297,
822
+ "step": 1070
823
+ },
824
+ {
825
+ "epoch": 2.026241799437676,
826
+ "grad_norm": 0.28242990374565125,
827
+ "learning_rate": 0.0007993673944559154,
828
+ "loss": 0.0773,
829
+ "step": 1080
830
+ },
831
+ {
832
+ "epoch": 2.044985941893158,
833
+ "grad_norm": 0.3516603112220764,
834
+ "learning_rate": 0.0007993615369971738,
835
+ "loss": 0.0799,
836
+ "step": 1090
837
+ },
838
+ {
839
+ "epoch": 2.063730084348641,
840
+ "grad_norm": 0.3558428883552551,
841
+ "learning_rate": 0.0007993556795384323,
842
+ "loss": 0.0885,
843
+ "step": 1100
844
+ },
845
+ {
846
+ "epoch": 2.082474226804124,
847
+ "grad_norm": 0.3211170732975006,
848
+ "learning_rate": 0.0007993498220796908,
849
+ "loss": 0.0825,
850
+ "step": 1110
851
+ },
852
+ {
853
+ "epoch": 2.1012183692596063,
854
+ "grad_norm": 0.20844395458698273,
855
+ "learning_rate": 0.0007993439646209492,
856
+ "loss": 0.0763,
857
+ "step": 1120
858
+ },
859
+ {
860
+ "epoch": 2.119962511715089,
861
+ "grad_norm": 0.3156029284000397,
862
+ "learning_rate": 0.0007993381071622077,
863
+ "loss": 0.0797,
864
+ "step": 1130
865
+ },
866
+ {
867
+ "epoch": 2.138706654170572,
868
+ "grad_norm": 0.3986193835735321,
869
+ "learning_rate": 0.0007993322497034662,
870
+ "loss": 0.0852,
871
+ "step": 1140
872
+ },
873
+ {
874
+ "epoch": 2.1574507966260543,
875
+ "grad_norm": 0.18681703507900238,
876
+ "learning_rate": 0.0007993263922447247,
877
+ "loss": 0.0779,
878
+ "step": 1150
879
+ },
880
+ {
881
+ "epoch": 2.176194939081537,
882
+ "grad_norm": 0.2365262657403946,
883
+ "learning_rate": 0.0007993205347859831,
884
+ "loss": 0.0833,
885
+ "step": 1160
886
+ },
887
+ {
888
+ "epoch": 2.1949390815370196,
889
+ "grad_norm": 0.25459378957748413,
890
+ "learning_rate": 0.0007993146773272417,
891
+ "loss": 0.0761,
892
+ "step": 1170
893
+ },
894
+ {
895
+ "epoch": 2.2136832239925024,
896
+ "grad_norm": 0.39024218916893005,
897
+ "learning_rate": 0.0007993088198685,
898
+ "loss": 0.0873,
899
+ "step": 1180
900
+ },
901
+ {
902
+ "epoch": 2.2324273664479852,
903
+ "grad_norm": 0.3662407100200653,
904
+ "learning_rate": 0.0007993029624097585,
905
+ "loss": 0.0842,
906
+ "step": 1190
907
+ },
908
+ {
909
+ "epoch": 2.2511715089034676,
910
+ "grad_norm": 0.30686551332473755,
911
+ "learning_rate": 0.0007992971049510171,
912
+ "loss": 0.0845,
913
+ "step": 1200
914
+ },
915
+ {
916
+ "epoch": 2.2699156513589505,
917
+ "grad_norm": 0.29860755801200867,
918
+ "learning_rate": 0.0007992912474922755,
919
+ "loss": 0.0806,
920
+ "step": 1210
921
+ },
922
+ {
923
+ "epoch": 2.288659793814433,
924
+ "grad_norm": 0.272029310464859,
925
+ "learning_rate": 0.0007992853900335341,
926
+ "loss": 0.0849,
927
+ "step": 1220
928
+ },
929
+ {
930
+ "epoch": 2.3074039362699157,
931
+ "grad_norm": 0.23034346103668213,
932
+ "learning_rate": 0.0007992795325747924,
933
+ "loss": 0.0873,
934
+ "step": 1230
935
+ },
936
+ {
937
+ "epoch": 2.3261480787253985,
938
+ "grad_norm": 0.38400229811668396,
939
+ "learning_rate": 0.0007992736751160509,
940
+ "loss": 0.0854,
941
+ "step": 1240
942
+ },
943
+ {
944
+ "epoch": 2.344892221180881,
945
+ "grad_norm": 0.2619285583496094,
946
+ "learning_rate": 0.0007992678176573094,
947
+ "loss": 0.0854,
948
+ "step": 1250
949
+ },
950
+ {
951
+ "epoch": 2.344892221180881,
952
+ "eval_loss": 0.0549059733748436,
953
+ "eval_pearson_cosine": 0.7565033435821533,
954
+ "eval_pearson_dot": 0.7438405752182007,
955
+ "eval_pearson_euclidean": 0.7398021221160889,
956
+ "eval_pearson_manhattan": 0.7514023780822754,
957
+ "eval_runtime": 27.2363,
958
+ "eval_samples_per_second": 55.074,
959
+ "eval_spearman_cosine": 0.7657686934808458,
960
+ "eval_spearman_dot": 0.7450125999969373,
961
+ "eval_spearman_euclidean": 0.7411997174627442,
962
+ "eval_spearman_manhattan": 0.754436544283217,
963
+ "eval_steps_per_second": 6.903,
964
+ "step": 1250
965
+ },
966
+ {
967
+ "epoch": 2.3636363636363638,
968
+ "grad_norm": 0.2573038935661316,
969
+ "learning_rate": 0.0007992619601985679,
970
+ "loss": 0.0812,
971
+ "step": 1260
972
+ },
973
+ {
974
+ "epoch": 2.382380506091846,
975
+ "grad_norm": 0.2684009373188019,
976
+ "learning_rate": 0.0007992561027398263,
977
+ "loss": 0.0833,
978
+ "step": 1270
979
+ },
980
+ {
981
+ "epoch": 2.401124648547329,
982
+ "grad_norm": 0.2773861289024353,
983
+ "learning_rate": 0.0007992502452810849,
984
+ "loss": 0.0902,
985
+ "step": 1280
986
+ },
987
+ {
988
+ "epoch": 2.419868791002812,
989
+ "grad_norm": 0.3180435001850128,
990
+ "learning_rate": 0.0007992443878223433,
991
+ "loss": 0.0882,
992
+ "step": 1290
993
+ },
994
+ {
995
+ "epoch": 2.438612933458294,
996
+ "grad_norm": 0.2758583426475525,
997
+ "learning_rate": 0.0007992385303636017,
998
+ "loss": 0.0815,
999
+ "step": 1300
1000
+ },
1001
+ {
1002
+ "epoch": 2.457357075913777,
1003
+ "grad_norm": 0.3327929973602295,
1004
+ "learning_rate": 0.0007992326729048603,
1005
+ "loss": 0.0949,
1006
+ "step": 1310
1007
+ },
1008
+ {
1009
+ "epoch": 2.4761012183692594,
1010
+ "grad_norm": 0.31645268201828003,
1011
+ "learning_rate": 0.0007992268154461187,
1012
+ "loss": 0.0942,
1013
+ "step": 1320
1014
+ },
1015
+ {
1016
+ "epoch": 2.4948453608247423,
1017
+ "grad_norm": 0.2587279975414276,
1018
+ "learning_rate": 0.0007992209579873773,
1019
+ "loss": 0.0889,
1020
+ "step": 1330
1021
+ },
1022
+ {
1023
+ "epoch": 2.513589503280225,
1024
+ "grad_norm": 0.29799187183380127,
1025
+ "learning_rate": 0.0007992151005286357,
1026
+ "loss": 0.1027,
1027
+ "step": 1340
1028
+ },
1029
+ {
1030
+ "epoch": 2.5323336457357075,
1031
+ "grad_norm": 0.3042343258857727,
1032
+ "learning_rate": 0.0007992092430698941,
1033
+ "loss": 0.0947,
1034
+ "step": 1350
1035
+ },
1036
+ {
1037
+ "epoch": 2.5510777881911904,
1038
+ "grad_norm": 0.36439308524131775,
1039
+ "learning_rate": 0.0007992033856111527,
1040
+ "loss": 0.0887,
1041
+ "step": 1360
1042
+ },
1043
+ {
1044
+ "epoch": 2.5698219306466727,
1045
+ "grad_norm": 0.24675941467285156,
1046
+ "learning_rate": 0.0007991975281524111,
1047
+ "loss": 0.0893,
1048
+ "step": 1370
1049
+ },
1050
+ {
1051
+ "epoch": 2.5885660731021556,
1052
+ "grad_norm": 0.3232560157775879,
1053
+ "learning_rate": 0.0007991916706936696,
1054
+ "loss": 0.0949,
1055
+ "step": 1380
1056
+ },
1057
+ {
1058
+ "epoch": 2.6073102155576384,
1059
+ "grad_norm": 0.3095908463001251,
1060
+ "learning_rate": 0.0007991858132349281,
1061
+ "loss": 0.0893,
1062
+ "step": 1390
1063
+ },
1064
+ {
1065
+ "epoch": 2.626054358013121,
1066
+ "grad_norm": 0.24996769428253174,
1067
+ "learning_rate": 0.0007991799557761866,
1068
+ "loss": 0.0918,
1069
+ "step": 1400
1070
+ },
1071
+ {
1072
+ "epoch": 2.6447985004686037,
1073
+ "grad_norm": 0.3013332486152649,
1074
+ "learning_rate": 0.0007991740983174449,
1075
+ "loss": 0.0965,
1076
+ "step": 1410
1077
+ },
1078
+ {
1079
+ "epoch": 2.663542642924086,
1080
+ "grad_norm": 0.43422290682792664,
1081
+ "learning_rate": 0.0007991682408587035,
1082
+ "loss": 0.1144,
1083
+ "step": 1420
1084
+ },
1085
+ {
1086
+ "epoch": 2.682286785379569,
1087
+ "grad_norm": 0.3462458848953247,
1088
+ "learning_rate": 0.0007991623833999619,
1089
+ "loss": 0.1068,
1090
+ "step": 1430
1091
+ },
1092
+ {
1093
+ "epoch": 2.7010309278350517,
1094
+ "grad_norm": 0.2752937078475952,
1095
+ "learning_rate": 0.0007991565259412205,
1096
+ "loss": 0.1048,
1097
+ "step": 1440
1098
+ },
1099
+ {
1100
+ "epoch": 2.719775070290534,
1101
+ "grad_norm": 0.33038660883903503,
1102
+ "learning_rate": 0.000799150668482479,
1103
+ "loss": 0.1055,
1104
+ "step": 1450
1105
+ },
1106
+ {
1107
+ "epoch": 2.738519212746017,
1108
+ "grad_norm": 0.28442054986953735,
1109
+ "learning_rate": 0.0007991448110237373,
1110
+ "loss": 0.1053,
1111
+ "step": 1460
1112
+ },
1113
+ {
1114
+ "epoch": 2.7572633552014993,
1115
+ "grad_norm": 0.25279343128204346,
1116
+ "learning_rate": 0.0007991389535649959,
1117
+ "loss": 0.109,
1118
+ "step": 1470
1119
+ },
1120
+ {
1121
+ "epoch": 2.776007497656982,
1122
+ "grad_norm": 0.3681808114051819,
1123
+ "learning_rate": 0.0007991330961062543,
1124
+ "loss": 0.1092,
1125
+ "step": 1480
1126
+ },
1127
+ {
1128
+ "epoch": 2.794751640112465,
1129
+ "grad_norm": 0.3884279429912567,
1130
+ "learning_rate": 0.0007991272386475128,
1131
+ "loss": 0.1105,
1132
+ "step": 1490
1133
+ },
1134
+ {
1135
+ "epoch": 2.8134957825679474,
1136
+ "grad_norm": 0.3542380928993225,
1137
+ "learning_rate": 0.0007991213811887713,
1138
+ "loss": 0.109,
1139
+ "step": 1500
1140
+ },
1141
+ {
1142
+ "epoch": 2.8134957825679474,
1143
+ "eval_loss": 0.06194353476166725,
1144
+ "eval_pearson_cosine": 0.7544945478439331,
1145
+ "eval_pearson_dot": 0.7297648787498474,
1146
+ "eval_pearson_euclidean": 0.7457708120346069,
1147
+ "eval_pearson_manhattan": 0.7537869215011597,
1148
+ "eval_runtime": 27.28,
1149
+ "eval_samples_per_second": 54.985,
1150
+ "eval_spearman_cosine": 0.7677406665753612,
1151
+ "eval_spearman_dot": 0.7355031880736892,
1152
+ "eval_spearman_euclidean": 0.752266788615453,
1153
+ "eval_spearman_manhattan": 0.7620929193607933,
1154
+ "eval_steps_per_second": 6.892,
1155
+ "step": 1500
1156
+ },
1157
+ {
1158
+ "epoch": 2.8322399250234302,
1159
+ "grad_norm": 0.28738659620285034,
1160
+ "learning_rate": 0.0007991155237300298,
1161
+ "loss": 0.1043,
1162
+ "step": 1510
1163
+ },
1164
+ {
1165
+ "epoch": 2.8509840674789126,
1166
+ "grad_norm": 0.39117714762687683,
1167
+ "learning_rate": 0.0007991096662712882,
1168
+ "loss": 0.0993,
1169
+ "step": 1520
1170
+ },
1171
+ {
1172
+ "epoch": 2.8697282099343955,
1173
+ "grad_norm": 0.3144415616989136,
1174
+ "learning_rate": 0.0007991038088125467,
1175
+ "loss": 0.1145,
1176
+ "step": 1530
1177
+ },
1178
+ {
1179
+ "epoch": 2.8884723523898783,
1180
+ "grad_norm": 0.28154823184013367,
1181
+ "learning_rate": 0.0007990979513538052,
1182
+ "loss": 0.1128,
1183
+ "step": 1540
1184
+ },
1185
+ {
1186
+ "epoch": 2.9072164948453607,
1187
+ "grad_norm": 0.3766768276691437,
1188
+ "learning_rate": 0.0007990920938950637,
1189
+ "loss": 0.1033,
1190
+ "step": 1550
1191
+ },
1192
+ {
1193
+ "epoch": 2.9259606373008435,
1194
+ "grad_norm": 0.38604792952537537,
1195
+ "learning_rate": 0.0007990862364363222,
1196
+ "loss": 0.1044,
1197
+ "step": 1560
1198
+ },
1199
+ {
1200
+ "epoch": 2.944704779756326,
1201
+ "grad_norm": 0.36833906173706055,
1202
+ "learning_rate": 0.0007990803789775806,
1203
+ "loss": 0.1159,
1204
+ "step": 1570
1205
+ },
1206
+ {
1207
+ "epoch": 2.963448922211809,
1208
+ "grad_norm": 0.3357650935649872,
1209
+ "learning_rate": 0.0007990745215188391,
1210
+ "loss": 0.1185,
1211
+ "step": 1580
1212
+ },
1213
+ {
1214
+ "epoch": 2.9821930646672916,
1215
+ "grad_norm": 0.30260348320007324,
1216
+ "learning_rate": 0.0007990686640600976,
1217
+ "loss": 0.1167,
1218
+ "step": 1590
1219
+ },
1220
+ {
1221
+ "epoch": 3.0018744142455485,
1222
+ "grad_norm": 0.28110650181770325,
1223
+ "learning_rate": 0.000799062806601356,
1224
+ "loss": 0.1115,
1225
+ "step": 1600
1226
+ },
1227
+ {
1228
+ "epoch": 3.020618556701031,
1229
+ "grad_norm": 0.32038745284080505,
1230
+ "learning_rate": 0.0007990569491426146,
1231
+ "loss": 0.0637,
1232
+ "step": 1610
1233
+ },
1234
+ {
1235
+ "epoch": 3.0393626991565137,
1236
+ "grad_norm": 0.29342755675315857,
1237
+ "learning_rate": 0.000799051091683873,
1238
+ "loss": 0.0687,
1239
+ "step": 1620
1240
+ },
1241
+ {
1242
+ "epoch": 3.058106841611996,
1243
+ "grad_norm": 0.33964619040489197,
1244
+ "learning_rate": 0.0007990452342251314,
1245
+ "loss": 0.0611,
1246
+ "step": 1630
1247
+ },
1248
+ {
1249
+ "epoch": 3.076850984067479,
1250
+ "grad_norm": 0.23580531775951385,
1251
+ "learning_rate": 0.0007990393767663899,
1252
+ "loss": 0.0635,
1253
+ "step": 1640
1254
+ },
1255
+ {
1256
+ "epoch": 3.0955951265229618,
1257
+ "grad_norm": 0.2617776393890381,
1258
+ "learning_rate": 0.0007990335193076484,
1259
+ "loss": 0.0709,
1260
+ "step": 1650
1261
+ },
1262
+ {
1263
+ "epoch": 3.114339268978444,
1264
+ "grad_norm": 0.25627410411834717,
1265
+ "learning_rate": 0.0007990276618489068,
1266
+ "loss": 0.0682,
1267
+ "step": 1660
1268
+ },
1269
+ {
1270
+ "epoch": 3.133083411433927,
1271
+ "grad_norm": 0.21987001597881317,
1272
+ "learning_rate": 0.0007990218043901654,
1273
+ "loss": 0.06,
1274
+ "step": 1670
1275
+ },
1276
+ {
1277
+ "epoch": 3.1518275538894094,
1278
+ "grad_norm": 0.2657093405723572,
1279
+ "learning_rate": 0.0007990159469314238,
1280
+ "loss": 0.0712,
1281
+ "step": 1680
1282
+ },
1283
+ {
1284
+ "epoch": 3.170571696344892,
1285
+ "grad_norm": 0.23929661512374878,
1286
+ "learning_rate": 0.0007990100894726823,
1287
+ "loss": 0.0566,
1288
+ "step": 1690
1289
+ },
1290
+ {
1291
+ "epoch": 3.189315838800375,
1292
+ "grad_norm": 0.23572145402431488,
1293
+ "learning_rate": 0.0007990042320139408,
1294
+ "loss": 0.0571,
1295
+ "step": 1700
1296
+ },
1297
+ {
1298
+ "epoch": 3.2080599812558575,
1299
+ "grad_norm": 0.26287132501602173,
1300
+ "learning_rate": 0.0007989983745551992,
1301
+ "loss": 0.067,
1302
+ "step": 1710
1303
+ },
1304
+ {
1305
+ "epoch": 3.2268041237113403,
1306
+ "grad_norm": 0.24504464864730835,
1307
+ "learning_rate": 0.0007989925170964578,
1308
+ "loss": 0.0637,
1309
+ "step": 1720
1310
+ },
1311
+ {
1312
+ "epoch": 3.2455482661668227,
1313
+ "grad_norm": 0.17006747424602509,
1314
+ "learning_rate": 0.0007989866596377162,
1315
+ "loss": 0.0552,
1316
+ "step": 1730
1317
+ },
1318
+ {
1319
+ "epoch": 3.2642924086223055,
1320
+ "grad_norm": 0.2752683460712433,
1321
+ "learning_rate": 0.0007989808021789747,
1322
+ "loss": 0.0639,
1323
+ "step": 1740
1324
+ },
1325
+ {
1326
+ "epoch": 3.2830365510777884,
1327
+ "grad_norm": 0.2681417465209961,
1328
+ "learning_rate": 0.0007989749447202332,
1329
+ "loss": 0.0705,
1330
+ "step": 1750
1331
+ },
1332
+ {
1333
+ "epoch": 3.2830365510777884,
1334
+ "eval_loss": 0.0486464686691761,
1335
+ "eval_pearson_cosine": 0.7632350921630859,
1336
+ "eval_pearson_dot": 0.7505504488945007,
1337
+ "eval_pearson_euclidean": 0.7458865642547607,
1338
+ "eval_pearson_manhattan": 0.7597954273223877,
1339
+ "eval_runtime": 27.3673,
1340
+ "eval_samples_per_second": 54.81,
1341
+ "eval_spearman_cosine": 0.7679814031707208,
1342
+ "eval_spearman_dot": 0.7517654374212466,
1343
+ "eval_spearman_euclidean": 0.7467275015139031,
1344
+ "eval_spearman_manhattan": 0.7607208640788498,
1345
+ "eval_steps_per_second": 6.87,
1346
+ "step": 1750
1347
+ },
1348
+ {
1349
+ "epoch": 3.3017806935332707,
1350
+ "grad_norm": 0.24346262216567993,
1351
+ "learning_rate": 0.0007989690872614916,
1352
+ "loss": 0.0658,
1353
+ "step": 1760
1354
+ },
1355
+ {
1356
+ "epoch": 3.3205248359887536,
1357
+ "grad_norm": 0.24957306683063507,
1358
+ "learning_rate": 0.0007989632298027502,
1359
+ "loss": 0.0643,
1360
+ "step": 1770
1361
+ },
1362
+ {
1363
+ "epoch": 3.3392689784442364,
1364
+ "grad_norm": 0.24416255950927734,
1365
+ "learning_rate": 0.0007989573723440086,
1366
+ "loss": 0.0626,
1367
+ "step": 1780
1368
+ },
1369
+ {
1370
+ "epoch": 3.358013120899719,
1371
+ "grad_norm": 0.2224712073802948,
1372
+ "learning_rate": 0.0007989515148852671,
1373
+ "loss": 0.0634,
1374
+ "step": 1790
1375
+ },
1376
+ {
1377
+ "epoch": 3.3767572633552017,
1378
+ "grad_norm": 0.27588558197021484,
1379
+ "learning_rate": 0.0007989456574265256,
1380
+ "loss": 0.0644,
1381
+ "step": 1800
1382
+ },
1383
+ {
1384
+ "epoch": 3.395501405810684,
1385
+ "grad_norm": 0.26377061009407043,
1386
+ "learning_rate": 0.000798939799967784,
1387
+ "loss": 0.0585,
1388
+ "step": 1810
1389
+ },
1390
+ {
1391
+ "epoch": 3.414245548266167,
1392
+ "grad_norm": 0.23178541660308838,
1393
+ "learning_rate": 0.0007989339425090424,
1394
+ "loss": 0.0588,
1395
+ "step": 1820
1396
+ },
1397
+ {
1398
+ "epoch": 3.4329896907216497,
1399
+ "grad_norm": 0.1893617808818817,
1400
+ "learning_rate": 0.000798928085050301,
1401
+ "loss": 0.0649,
1402
+ "step": 1830
1403
+ },
1404
+ {
1405
+ "epoch": 3.451733833177132,
1406
+ "grad_norm": 0.23445335030555725,
1407
+ "learning_rate": 0.0007989222275915595,
1408
+ "loss": 0.0629,
1409
+ "step": 1840
1410
+ },
1411
+ {
1412
+ "epoch": 3.470477975632615,
1413
+ "grad_norm": 0.457109659910202,
1414
+ "learning_rate": 0.0007989163701328179,
1415
+ "loss": 0.0646,
1416
+ "step": 1850
1417
+ },
1418
+ {
1419
+ "epoch": 3.4892221180880973,
1420
+ "grad_norm": 0.2316947728395462,
1421
+ "learning_rate": 0.0007989105126740764,
1422
+ "loss": 0.0677,
1423
+ "step": 1860
1424
+ },
1425
+ {
1426
+ "epoch": 3.50796626054358,
1427
+ "grad_norm": 0.26950669288635254,
1428
+ "learning_rate": 0.0007989046552153348,
1429
+ "loss": 0.0732,
1430
+ "step": 1870
1431
+ },
1432
+ {
1433
+ "epoch": 3.526710402999063,
1434
+ "grad_norm": 0.25258171558380127,
1435
+ "learning_rate": 0.0007988987977565933,
1436
+ "loss": 0.0635,
1437
+ "step": 1880
1438
+ },
1439
+ {
1440
+ "epoch": 3.5454545454545454,
1441
+ "grad_norm": 0.2282831370830536,
1442
+ "learning_rate": 0.0007988929402978518,
1443
+ "loss": 0.0766,
1444
+ "step": 1890
1445
+ },
1446
+ {
1447
+ "epoch": 3.5641986879100283,
1448
+ "grad_norm": 0.3049706220626831,
1449
+ "learning_rate": 0.0007988870828391103,
1450
+ "loss": 0.0766,
1451
+ "step": 1900
1452
+ },
1453
+ {
1454
+ "epoch": 3.5829428303655106,
1455
+ "grad_norm": 0.21556228399276733,
1456
+ "learning_rate": 0.0007988812253803688,
1457
+ "loss": 0.0694,
1458
+ "step": 1910
1459
+ },
1460
+ {
1461
+ "epoch": 3.6016869728209935,
1462
+ "grad_norm": 0.2859863340854645,
1463
+ "learning_rate": 0.0007988753679216272,
1464
+ "loss": 0.0665,
1465
+ "step": 1920
1466
+ },
1467
+ {
1468
+ "epoch": 3.6204311152764763,
1469
+ "grad_norm": 0.22522784769535065,
1470
+ "learning_rate": 0.0007988695104628857,
1471
+ "loss": 0.073,
1472
+ "step": 1930
1473
+ },
1474
+ {
1475
+ "epoch": 3.6391752577319587,
1476
+ "grad_norm": 0.3301334083080292,
1477
+ "learning_rate": 0.0007988636530041442,
1478
+ "loss": 0.0745,
1479
+ "step": 1940
1480
+ },
1481
+ {
1482
+ "epoch": 3.6579194001874415,
1483
+ "grad_norm": 0.21438319981098175,
1484
+ "learning_rate": 0.0007988577955454027,
1485
+ "loss": 0.0713,
1486
+ "step": 1950
1487
+ },
1488
+ {
1489
+ "epoch": 3.676663542642924,
1490
+ "grad_norm": 0.3207626938819885,
1491
+ "learning_rate": 0.0007988519380866611,
1492
+ "loss": 0.0759,
1493
+ "step": 1960
1494
+ },
1495
+ {
1496
+ "epoch": 3.695407685098407,
1497
+ "grad_norm": 0.25493231415748596,
1498
+ "learning_rate": 0.0007988460806279197,
1499
+ "loss": 0.0722,
1500
+ "step": 1970
1501
+ },
1502
+ {
1503
+ "epoch": 3.7141518275538896,
1504
+ "grad_norm": 0.2732018530368805,
1505
+ "learning_rate": 0.0007988402231691781,
1506
+ "loss": 0.0773,
1507
+ "step": 1980
1508
+ },
1509
+ {
1510
+ "epoch": 3.732895970009372,
1511
+ "grad_norm": 0.19611899554729462,
1512
+ "learning_rate": 0.0007988343657104365,
1513
+ "loss": 0.0773,
1514
+ "step": 1990
1515
+ },
1516
+ {
1517
+ "epoch": 3.751640112464855,
1518
+ "grad_norm": 0.2664394676685333,
1519
+ "learning_rate": 0.0007988285082516951,
1520
+ "loss": 0.072,
1521
+ "step": 2000
1522
+ },
1523
+ {
1524
+ "epoch": 3.751640112464855,
1525
+ "eval_loss": 0.05059043690562248,
1526
+ "eval_pearson_cosine": 0.7549334764480591,
1527
+ "eval_pearson_dot": 0.7364022731781006,
1528
+ "eval_pearson_euclidean": 0.7430644035339355,
1529
+ "eval_pearson_manhattan": 0.7528964281082153,
1530
+ "eval_runtime": 27.2774,
1531
+ "eval_samples_per_second": 54.991,
1532
+ "eval_spearman_cosine": 0.7612361982335023,
1533
+ "eval_spearman_dot": 0.7370856746295986,
1534
+ "eval_spearman_euclidean": 0.7449844586260276,
1535
+ "eval_spearman_manhattan": 0.7551494271561938,
1536
+ "eval_steps_per_second": 6.892,
1537
+ "step": 2000
1538
  }
1539
  ],
1540
  "logging_steps": 10,