CocoRoF commited on
Commit
394503e
·
verified ·
1 Parent(s): d26c45d

Training in progress, step 1500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c854e311156a00fb209c1d5b18bf088757f9e875811a4af0292d4b051e6c6446
3
  size 735217848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74572018ac522fe8bf3b91fec5b2e11a917f01268eb7fa79d8c28e82716c3641
3
  size 735217848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de307672e8d7a91febef716662f5657dacb787fa03178bc23fb37badcafa7ed1
3
  size 1470521978
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aaf6e503e47ad9a45b9fb2dbb896936f1d179d52c20c550add65c77f44a8193c
3
  size 1470521978
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:038ef74d9d7647e927602a31e3ff40ed015ce2147efee9b81efc43a4be3f559b
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5672d4a2bab2f5ec1b202aa86f336deecf9ade33ecc3e9f1ae101d08c2403c85
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fca3c514eb217652ef846414a7b25fe2d542ec928f14020a84d1e47090ecb880
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8dfb306257217b253dcb010fa8e7db4904b5a105fb159cc7b2977c1d185fc223
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.46860356138706655,
5
  "eval_steps": 100,
6
- "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -867,6 +867,436 @@
867
  "eval_spearman_manhattan": 0.8284194308491212,
868
  "eval_steps_per_second": 15.014,
869
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
870
  }
871
  ],
872
  "logging_steps": 10,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7029053420805998,
5
  "eval_steps": 100,
6
+ "global_step": 1500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
867
  "eval_spearman_manhattan": 0.8284194308491212,
868
  "eval_steps_per_second": 15.014,
869
  "step": 1000
870
+ },
871
+ {
872
+ "epoch": 0.4732895970009372,
873
+ "grad_norm": 2.064291477203369,
874
+ "learning_rate": 4.7041940018744145e-05,
875
+ "loss": 0.3188,
876
+ "step": 1010
877
+ },
878
+ {
879
+ "epoch": 0.47797563261480785,
880
+ "grad_norm": 1.338891625404358,
881
+ "learning_rate": 4.7012652296157454e-05,
882
+ "loss": 0.3381,
883
+ "step": 1020
884
+ },
885
+ {
886
+ "epoch": 0.48266166822867856,
887
+ "grad_norm": 1.4479578733444214,
888
+ "learning_rate": 4.698336457357076e-05,
889
+ "loss": 0.3187,
890
+ "step": 1030
891
+ },
892
+ {
893
+ "epoch": 0.4873477038425492,
894
+ "grad_norm": 2.0868189334869385,
895
+ "learning_rate": 4.695407685098407e-05,
896
+ "loss": 0.3697,
897
+ "step": 1040
898
+ },
899
+ {
900
+ "epoch": 0.49203373945641987,
901
+ "grad_norm": 1.9820175170898438,
902
+ "learning_rate": 4.692478912839738e-05,
903
+ "loss": 0.3079,
904
+ "step": 1050
905
+ },
906
+ {
907
+ "epoch": 0.4967197750702905,
908
+ "grad_norm": 1.2479910850524902,
909
+ "learning_rate": 4.689550140581069e-05,
910
+ "loss": 0.3129,
911
+ "step": 1060
912
+ },
913
+ {
914
+ "epoch": 0.5014058106841612,
915
+ "grad_norm": 1.5005191564559937,
916
+ "learning_rate": 4.6866213683223996e-05,
917
+ "loss": 0.3588,
918
+ "step": 1070
919
+ },
920
+ {
921
+ "epoch": 0.5060918462980318,
922
+ "grad_norm": 1.730153203010559,
923
+ "learning_rate": 4.68369259606373e-05,
924
+ "loss": 0.3511,
925
+ "step": 1080
926
+ },
927
+ {
928
+ "epoch": 0.5107778819119025,
929
+ "grad_norm": 1.8256272077560425,
930
+ "learning_rate": 4.680763823805061e-05,
931
+ "loss": 0.3483,
932
+ "step": 1090
933
+ },
934
+ {
935
+ "epoch": 0.5154639175257731,
936
+ "grad_norm": 1.8275713920593262,
937
+ "learning_rate": 4.677835051546392e-05,
938
+ "loss": 0.3301,
939
+ "step": 1100
940
+ },
941
+ {
942
+ "epoch": 0.5154639175257731,
943
+ "eval_loss": 0.06174962595105171,
944
+ "eval_pearson_cosine": 0.8116036079365685,
945
+ "eval_pearson_dot": 0.7637833872485942,
946
+ "eval_pearson_euclidean": 0.8160585823410784,
947
+ "eval_pearson_manhattan": 0.8176746469698344,
948
+ "eval_runtime": 6.1383,
949
+ "eval_samples_per_second": 244.369,
950
+ "eval_spearman_cosine": 0.8149952741898824,
951
+ "eval_spearman_dot": 0.759784369983796,
952
+ "eval_spearman_euclidean": 0.8211588786730816,
953
+ "eval_spearman_manhattan": 0.8228445193252625,
954
+ "eval_steps_per_second": 15.314,
955
+ "step": 1100
956
+ },
957
+ {
958
+ "epoch": 0.5201499531396439,
959
+ "grad_norm": 1.7808656692504883,
960
+ "learning_rate": 4.674906279287723e-05,
961
+ "loss": 0.3378,
962
+ "step": 1110
963
+ },
964
+ {
965
+ "epoch": 0.5248359887535146,
966
+ "grad_norm": 1.3912303447723389,
967
+ "learning_rate": 4.671977507029054e-05,
968
+ "loss": 0.3247,
969
+ "step": 1120
970
+ },
971
+ {
972
+ "epoch": 0.5295220243673852,
973
+ "grad_norm": 1.619547724723816,
974
+ "learning_rate": 4.669048734770384e-05,
975
+ "loss": 0.3548,
976
+ "step": 1130
977
+ },
978
+ {
979
+ "epoch": 0.5342080599812559,
980
+ "grad_norm": 1.6785143613815308,
981
+ "learning_rate": 4.666119962511715e-05,
982
+ "loss": 0.4056,
983
+ "step": 1140
984
+ },
985
+ {
986
+ "epoch": 0.5388940955951266,
987
+ "grad_norm": 1.4282417297363281,
988
+ "learning_rate": 4.6631911902530465e-05,
989
+ "loss": 0.3136,
990
+ "step": 1150
991
+ },
992
+ {
993
+ "epoch": 0.5435801312089972,
994
+ "grad_norm": 1.5950373411178589,
995
+ "learning_rate": 4.660262417994377e-05,
996
+ "loss": 0.3094,
997
+ "step": 1160
998
+ },
999
+ {
1000
+ "epoch": 0.5482661668228679,
1001
+ "grad_norm": 1.9235565662384033,
1002
+ "learning_rate": 4.657333645735708e-05,
1003
+ "loss": 0.3409,
1004
+ "step": 1170
1005
+ },
1006
+ {
1007
+ "epoch": 0.5529522024367385,
1008
+ "grad_norm": 1.2192574739456177,
1009
+ "learning_rate": 4.6544048734770383e-05,
1010
+ "loss": 0.3387,
1011
+ "step": 1180
1012
+ },
1013
+ {
1014
+ "epoch": 0.5576382380506092,
1015
+ "grad_norm": 1.5550990104675293,
1016
+ "learning_rate": 4.651476101218369e-05,
1017
+ "loss": 0.3184,
1018
+ "step": 1190
1019
+ },
1020
+ {
1021
+ "epoch": 0.5623242736644799,
1022
+ "grad_norm": 1.8576079607009888,
1023
+ "learning_rate": 4.6485473289597e-05,
1024
+ "loss": 0.3637,
1025
+ "step": 1200
1026
+ },
1027
+ {
1028
+ "epoch": 0.5623242736644799,
1029
+ "eval_loss": 0.05324321612715721,
1030
+ "eval_pearson_cosine": 0.8107929850304814,
1031
+ "eval_pearson_dot": 0.768063847349957,
1032
+ "eval_pearson_euclidean": 0.8155502077488883,
1033
+ "eval_pearson_manhattan": 0.8174981555238503,
1034
+ "eval_runtime": 5.9644,
1035
+ "eval_samples_per_second": 251.492,
1036
+ "eval_spearman_cosine": 0.8145222586962418,
1037
+ "eval_spearman_dot": 0.7642997219390888,
1038
+ "eval_spearman_euclidean": 0.8201735536723759,
1039
+ "eval_spearman_manhattan": 0.8222186632592043,
1040
+ "eval_steps_per_second": 15.76,
1041
+ "step": 1200
1042
+ },
1043
+ {
1044
+ "epoch": 0.5670103092783505,
1045
+ "grad_norm": 1.41835618019104,
1046
+ "learning_rate": 4.6456185567010316e-05,
1047
+ "loss": 0.3039,
1048
+ "step": 1210
1049
+ },
1050
+ {
1051
+ "epoch": 0.5716963448922212,
1052
+ "grad_norm": 1.3850994110107422,
1053
+ "learning_rate": 4.6426897844423624e-05,
1054
+ "loss": 0.3187,
1055
+ "step": 1220
1056
+ },
1057
+ {
1058
+ "epoch": 0.5763823805060918,
1059
+ "grad_norm": 1.437373399734497,
1060
+ "learning_rate": 4.6397610121836926e-05,
1061
+ "loss": 0.3337,
1062
+ "step": 1230
1063
+ },
1064
+ {
1065
+ "epoch": 0.5810684161199625,
1066
+ "grad_norm": 1.2328146696090698,
1067
+ "learning_rate": 4.6368322399250235e-05,
1068
+ "loss": 0.2975,
1069
+ "step": 1240
1070
+ },
1071
+ {
1072
+ "epoch": 0.5857544517338332,
1073
+ "grad_norm": 1.6191329956054688,
1074
+ "learning_rate": 4.633903467666354e-05,
1075
+ "loss": 0.3275,
1076
+ "step": 1250
1077
+ },
1078
+ {
1079
+ "epoch": 0.5904404873477038,
1080
+ "grad_norm": 1.695470929145813,
1081
+ "learning_rate": 4.630974695407685e-05,
1082
+ "loss": 0.3485,
1083
+ "step": 1260
1084
+ },
1085
+ {
1086
+ "epoch": 0.5951265229615745,
1087
+ "grad_norm": 1.6120591163635254,
1088
+ "learning_rate": 4.628045923149017e-05,
1089
+ "loss": 0.3515,
1090
+ "step": 1270
1091
+ },
1092
+ {
1093
+ "epoch": 0.5998125585754451,
1094
+ "grad_norm": 1.6157792806625366,
1095
+ "learning_rate": 4.625117150890347e-05,
1096
+ "loss": 0.3043,
1097
+ "step": 1280
1098
+ },
1099
+ {
1100
+ "epoch": 0.6044985941893158,
1101
+ "grad_norm": 1.1465294361114502,
1102
+ "learning_rate": 4.622188378631678e-05,
1103
+ "loss": 0.2884,
1104
+ "step": 1290
1105
+ },
1106
+ {
1107
+ "epoch": 0.6091846298031866,
1108
+ "grad_norm": 1.583688497543335,
1109
+ "learning_rate": 4.6192596063730086e-05,
1110
+ "loss": 0.2885,
1111
+ "step": 1300
1112
+ },
1113
+ {
1114
+ "epoch": 0.6091846298031866,
1115
+ "eval_loss": 0.04511857405304909,
1116
+ "eval_pearson_cosine": 0.8272008065998051,
1117
+ "eval_pearson_dot": 0.7924789666171037,
1118
+ "eval_pearson_euclidean": 0.8268065548805623,
1119
+ "eval_pearson_manhattan": 0.827500349653536,
1120
+ "eval_runtime": 6.0244,
1121
+ "eval_samples_per_second": 248.988,
1122
+ "eval_spearman_cosine": 0.8278338594350843,
1123
+ "eval_spearman_dot": 0.7887595412839734,
1124
+ "eval_spearman_euclidean": 0.8317669408319824,
1125
+ "eval_spearman_manhattan": 0.8323949761116776,
1126
+ "eval_steps_per_second": 15.603,
1127
+ "step": 1300
1128
+ },
1129
+ {
1130
+ "epoch": 0.6138706654170571,
1131
+ "grad_norm": 1.081640362739563,
1132
+ "learning_rate": 4.6163308341143395e-05,
1133
+ "loss": 0.3526,
1134
+ "step": 1310
1135
+ },
1136
+ {
1137
+ "epoch": 0.6185567010309279,
1138
+ "grad_norm": 1.760512351989746,
1139
+ "learning_rate": 4.61340206185567e-05,
1140
+ "loss": 0.3113,
1141
+ "step": 1320
1142
+ },
1143
+ {
1144
+ "epoch": 0.6232427366447985,
1145
+ "grad_norm": 1.106444239616394,
1146
+ "learning_rate": 4.610473289597001e-05,
1147
+ "loss": 0.3126,
1148
+ "step": 1330
1149
+ },
1150
+ {
1151
+ "epoch": 0.6279287722586692,
1152
+ "grad_norm": 1.3500837087631226,
1153
+ "learning_rate": 4.607544517338332e-05,
1154
+ "loss": 0.3094,
1155
+ "step": 1340
1156
+ },
1157
+ {
1158
+ "epoch": 0.6326148078725399,
1159
+ "grad_norm": 1.727953553199768,
1160
+ "learning_rate": 4.604615745079663e-05,
1161
+ "loss": 0.3304,
1162
+ "step": 1350
1163
+ },
1164
+ {
1165
+ "epoch": 0.6373008434864105,
1166
+ "grad_norm": 1.4341022968292236,
1167
+ "learning_rate": 4.601686972820994e-05,
1168
+ "loss": 0.2804,
1169
+ "step": 1360
1170
+ },
1171
+ {
1172
+ "epoch": 0.6419868791002812,
1173
+ "grad_norm": 1.4479708671569824,
1174
+ "learning_rate": 4.5987582005623246e-05,
1175
+ "loss": 0.31,
1176
+ "step": 1370
1177
+ },
1178
+ {
1179
+ "epoch": 0.6466729147141518,
1180
+ "grad_norm": 1.5667890310287476,
1181
+ "learning_rate": 4.5958294283036554e-05,
1182
+ "loss": 0.3149,
1183
+ "step": 1380
1184
+ },
1185
+ {
1186
+ "epoch": 0.6513589503280225,
1187
+ "grad_norm": 1.7333146333694458,
1188
+ "learning_rate": 4.592900656044986e-05,
1189
+ "loss": 0.3247,
1190
+ "step": 1390
1191
+ },
1192
+ {
1193
+ "epoch": 0.6560449859418932,
1194
+ "grad_norm": 1.914392113685608,
1195
+ "learning_rate": 4.589971883786317e-05,
1196
+ "loss": 0.2852,
1197
+ "step": 1400
1198
+ },
1199
+ {
1200
+ "epoch": 0.6560449859418932,
1201
+ "eval_loss": 0.04731455817818642,
1202
+ "eval_pearson_cosine": 0.8245641713392331,
1203
+ "eval_pearson_dot": 0.7893189374890994,
1204
+ "eval_pearson_euclidean": 0.8220644314223797,
1205
+ "eval_pearson_manhattan": 0.8227839674683928,
1206
+ "eval_runtime": 6.0521,
1207
+ "eval_samples_per_second": 247.846,
1208
+ "eval_spearman_cosine": 0.8264178003782281,
1209
+ "eval_spearman_dot": 0.7874134051082518,
1210
+ "eval_spearman_euclidean": 0.8274821508565314,
1211
+ "eval_spearman_manhattan": 0.8280999297389011,
1212
+ "eval_steps_per_second": 15.532,
1213
+ "step": 1400
1214
+ },
1215
+ {
1216
+ "epoch": 0.6607310215557638,
1217
+ "grad_norm": 1.2458995580673218,
1218
+ "learning_rate": 4.587043111527648e-05,
1219
+ "loss": 0.3068,
1220
+ "step": 1410
1221
+ },
1222
+ {
1223
+ "epoch": 0.6654170571696345,
1224
+ "grad_norm": 1.6540151834487915,
1225
+ "learning_rate": 4.584114339268979e-05,
1226
+ "loss": 0.3034,
1227
+ "step": 1420
1228
+ },
1229
+ {
1230
+ "epoch": 0.6701030927835051,
1231
+ "grad_norm": 1.2585715055465698,
1232
+ "learning_rate": 4.581185567010309e-05,
1233
+ "loss": 0.3297,
1234
+ "step": 1430
1235
+ },
1236
+ {
1237
+ "epoch": 0.6747891283973758,
1238
+ "grad_norm": 1.5088609457015991,
1239
+ "learning_rate": 4.5782567947516406e-05,
1240
+ "loss": 0.3444,
1241
+ "step": 1440
1242
+ },
1243
+ {
1244
+ "epoch": 0.6794751640112465,
1245
+ "grad_norm": 1.3120390176773071,
1246
+ "learning_rate": 4.5753280224929714e-05,
1247
+ "loss": 0.2882,
1248
+ "step": 1450
1249
+ },
1250
+ {
1251
+ "epoch": 0.6841611996251171,
1252
+ "grad_norm": 1.1074262857437134,
1253
+ "learning_rate": 4.572399250234302e-05,
1254
+ "loss": 0.287,
1255
+ "step": 1460
1256
+ },
1257
+ {
1258
+ "epoch": 0.6888472352389878,
1259
+ "grad_norm": 1.5284086465835571,
1260
+ "learning_rate": 4.569470477975633e-05,
1261
+ "loss": 0.3175,
1262
+ "step": 1470
1263
+ },
1264
+ {
1265
+ "epoch": 0.6935332708528584,
1266
+ "grad_norm": 1.5610471963882446,
1267
+ "learning_rate": 4.566541705716963e-05,
1268
+ "loss": 0.3033,
1269
+ "step": 1480
1270
+ },
1271
+ {
1272
+ "epoch": 0.6982193064667291,
1273
+ "grad_norm": 1.1839112043380737,
1274
+ "learning_rate": 4.563612933458294e-05,
1275
+ "loss": 0.2917,
1276
+ "step": 1490
1277
+ },
1278
+ {
1279
+ "epoch": 0.7029053420805998,
1280
+ "grad_norm": 1.2611138820648193,
1281
+ "learning_rate": 4.560684161199626e-05,
1282
+ "loss": 0.3225,
1283
+ "step": 1500
1284
+ },
1285
+ {
1286
+ "epoch": 0.7029053420805998,
1287
+ "eval_loss": 0.05073302239179611,
1288
+ "eval_pearson_cosine": 0.8258767050389224,
1289
+ "eval_pearson_dot": 0.7737186817222579,
1290
+ "eval_pearson_euclidean": 0.826299226589029,
1291
+ "eval_pearson_manhattan": 0.8274116157485736,
1292
+ "eval_runtime": 6.1756,
1293
+ "eval_samples_per_second": 242.892,
1294
+ "eval_spearman_cosine": 0.828410027637777,
1295
+ "eval_spearman_dot": 0.7707599871747091,
1296
+ "eval_spearman_euclidean": 0.8324611004376368,
1297
+ "eval_spearman_manhattan": 0.8335300441487923,
1298
+ "eval_steps_per_second": 15.221,
1299
+ "step": 1500
1300
  }
1301
  ],
1302
  "logging_steps": 10,