Training in progress, step 1800, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 3237818848
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:30cf4ddc8138dc0b63c04cf5856ccaefc44f54d57161548a2bcf67587713dfed
|
| 3 |
size 3237818848
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2062251569
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4c88d4612f6436cb0270beb0bb2ab7cbb57317eafb7b87764e12d36ec083c260
|
| 3 |
size 2062251569
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d3b789b883f13ca849e56997deda5a819a4b325b5d103e882990a667f22165d3
|
| 3 |
size 14645
|
last-checkpoint/scaler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1383
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1547aae10ac7691e1716f567b08e3b4d274fa923879a48af8c2bb55c815a28a2
|
| 3 |
size 1383
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9f5a59feb5a16bc7cf6785205b16a58a4ce06c6d1cd586567a10fcc2307ab6fc
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 300,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -1070,6 +1070,216 @@
|
|
| 1070 |
"learning_rate": 5.231958800515164e-05,
|
| 1071 |
"loss": 1.0044,
|
| 1072 |
"step": 1500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1073 |
}
|
| 1074 |
],
|
| 1075 |
"logging_steps": 10,
|
|
@@ -1089,7 +1299,7 @@
|
|
| 1089 |
"attributes": {}
|
| 1090 |
}
|
| 1091 |
},
|
| 1092 |
-
"total_flos": 4.
|
| 1093 |
"train_batch_size": 4,
|
| 1094 |
"trial_name": null,
|
| 1095 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.8,
|
| 6 |
"eval_steps": 300,
|
| 7 |
+
"global_step": 1800,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 1070 |
"learning_rate": 5.231958800515164e-05,
|
| 1071 |
"loss": 1.0044,
|
| 1072 |
"step": 1500
|
| 1073 |
+
},
|
| 1074 |
+
{
|
| 1075 |
+
"epoch": 0.6711111111111111,
|
| 1076 |
+
"grad_norm": 7.949609279632568,
|
| 1077 |
+
"learning_rate": 5.107265654859855e-05,
|
| 1078 |
+
"loss": 1.0194,
|
| 1079 |
+
"step": 1510
|
| 1080 |
+
},
|
| 1081 |
+
{
|
| 1082 |
+
"epoch": 0.6755555555555556,
|
| 1083 |
+
"grad_norm": 8.028242111206055,
|
| 1084 |
+
"learning_rate": 4.983564786433763e-05,
|
| 1085 |
+
"loss": 0.9705,
|
| 1086 |
+
"step": 1520
|
| 1087 |
+
},
|
| 1088 |
+
{
|
| 1089 |
+
"epoch": 0.68,
|
| 1090 |
+
"grad_norm": 8.18526840209961,
|
| 1091 |
+
"learning_rate": 4.860881282549285e-05,
|
| 1092 |
+
"loss": 0.9802,
|
| 1093 |
+
"step": 1530
|
| 1094 |
+
},
|
| 1095 |
+
{
|
| 1096 |
+
"epoch": 0.6844444444444444,
|
| 1097 |
+
"grad_norm": 9.321311950683594,
|
| 1098 |
+
"learning_rate": 4.739240024190904e-05,
|
| 1099 |
+
"loss": 0.9649,
|
| 1100 |
+
"step": 1540
|
| 1101 |
+
},
|
| 1102 |
+
{
|
| 1103 |
+
"epoch": 0.6888888888888889,
|
| 1104 |
+
"grad_norm": 10.959417343139648,
|
| 1105 |
+
"learning_rate": 4.618665680969163e-05,
|
| 1106 |
+
"loss": 0.9957,
|
| 1107 |
+
"step": 1550
|
| 1108 |
+
},
|
| 1109 |
+
{
|
| 1110 |
+
"epoch": 0.6933333333333334,
|
| 1111 |
+
"grad_norm": 9.302586555480957,
|
| 1112 |
+
"learning_rate": 4.49918270611752e-05,
|
| 1113 |
+
"loss": 0.9833,
|
| 1114 |
+
"step": 1560
|
| 1115 |
+
},
|
| 1116 |
+
{
|
| 1117 |
+
"epoch": 0.6977777777777778,
|
| 1118 |
+
"grad_norm": 7.047448635101318,
|
| 1119 |
+
"learning_rate": 4.380815331533088e-05,
|
| 1120 |
+
"loss": 1.0179,
|
| 1121 |
+
"step": 1570
|
| 1122 |
+
},
|
| 1123 |
+
{
|
| 1124 |
+
"epoch": 0.7022222222222222,
|
| 1125 |
+
"grad_norm": 9.307101249694824,
|
| 1126 |
+
"learning_rate": 4.2635875628622345e-05,
|
| 1127 |
+
"loss": 0.9883,
|
| 1128 |
+
"step": 1580
|
| 1129 |
+
},
|
| 1130 |
+
{
|
| 1131 |
+
"epoch": 0.7066666666666667,
|
| 1132 |
+
"grad_norm": 8.306827545166016,
|
| 1133 |
+
"learning_rate": 4.147523174632103e-05,
|
| 1134 |
+
"loss": 0.984,
|
| 1135 |
+
"step": 1590
|
| 1136 |
+
},
|
| 1137 |
+
{
|
| 1138 |
+
"epoch": 0.7111111111111111,
|
| 1139 |
+
"grad_norm": 9.073155403137207,
|
| 1140 |
+
"learning_rate": 4.032645705428985e-05,
|
| 1141 |
+
"loss": 0.9916,
|
| 1142 |
+
"step": 1600
|
| 1143 |
+
},
|
| 1144 |
+
{
|
| 1145 |
+
"epoch": 0.7155555555555555,
|
| 1146 |
+
"grad_norm": 11.148294448852539,
|
| 1147 |
+
"learning_rate": 3.9189784531245334e-05,
|
| 1148 |
+
"loss": 0.993,
|
| 1149 |
+
"step": 1610
|
| 1150 |
+
},
|
| 1151 |
+
{
|
| 1152 |
+
"epoch": 0.72,
|
| 1153 |
+
"grad_norm": 7.878681659698486,
|
| 1154 |
+
"learning_rate": 3.806544470150831e-05,
|
| 1155 |
+
"loss": 0.9733,
|
| 1156 |
+
"step": 1620
|
| 1157 |
+
},
|
| 1158 |
+
{
|
| 1159 |
+
"epoch": 0.7244444444444444,
|
| 1160 |
+
"grad_norm": 9.204869270324707,
|
| 1161 |
+
"learning_rate": 3.6953665588251984e-05,
|
| 1162 |
+
"loss": 0.9689,
|
| 1163 |
+
"step": 1630
|
| 1164 |
+
},
|
| 1165 |
+
{
|
| 1166 |
+
"epoch": 0.7288888888888889,
|
| 1167 |
+
"grad_norm": 8.391727447509766,
|
| 1168 |
+
"learning_rate": 3.585467266725737e-05,
|
| 1169 |
+
"loss": 0.9782,
|
| 1170 |
+
"step": 1640
|
| 1171 |
+
},
|
| 1172 |
+
{
|
| 1173 |
+
"epoch": 0.7333333333333333,
|
| 1174 |
+
"grad_norm": 6.572085857391357,
|
| 1175 |
+
"learning_rate": 3.4768688821185566e-05,
|
| 1176 |
+
"loss": 0.9548,
|
| 1177 |
+
"step": 1650
|
| 1178 |
+
},
|
| 1179 |
+
{
|
| 1180 |
+
"epoch": 0.7377777777777778,
|
| 1181 |
+
"grad_norm": 9.943083763122559,
|
| 1182 |
+
"learning_rate": 3.3695934294375544e-05,
|
| 1183 |
+
"loss": 0.9904,
|
| 1184 |
+
"step": 1660
|
| 1185 |
+
},
|
| 1186 |
+
{
|
| 1187 |
+
"epoch": 0.7422222222222222,
|
| 1188 |
+
"grad_norm": 8.165312767028809,
|
| 1189 |
+
"learning_rate": 3.263662664817728e-05,
|
| 1190 |
+
"loss": 0.9728,
|
| 1191 |
+
"step": 1670
|
| 1192 |
+
},
|
| 1193 |
+
{
|
| 1194 |
+
"epoch": 0.7466666666666667,
|
| 1195 |
+
"grad_norm": 9.635257720947266,
|
| 1196 |
+
"learning_rate": 3.15909807168291e-05,
|
| 1197 |
+
"loss": 0.961,
|
| 1198 |
+
"step": 1680
|
| 1199 |
+
},
|
| 1200 |
+
{
|
| 1201 |
+
"epoch": 0.7511111111111111,
|
| 1202 |
+
"grad_norm": 7.636417865753174,
|
| 1203 |
+
"learning_rate": 3.055920856388779e-05,
|
| 1204 |
+
"loss": 0.9403,
|
| 1205 |
+
"step": 1690
|
| 1206 |
+
},
|
| 1207 |
+
{
|
| 1208 |
+
"epoch": 0.7555555555555555,
|
| 1209 |
+
"grad_norm": 6.770568370819092,
|
| 1210 |
+
"learning_rate": 2.95415194392207e-05,
|
| 1211 |
+
"loss": 0.9484,
|
| 1212 |
+
"step": 1700
|
| 1213 |
+
},
|
| 1214 |
+
{
|
| 1215 |
+
"epoch": 0.76,
|
| 1216 |
+
"grad_norm": 7.254674434661865,
|
| 1217 |
+
"learning_rate": 2.8538119736568845e-05,
|
| 1218 |
+
"loss": 0.9701,
|
| 1219 |
+
"step": 1710
|
| 1220 |
+
},
|
| 1221 |
+
{
|
| 1222 |
+
"epoch": 0.7644444444444445,
|
| 1223 |
+
"grad_norm": 8.287463188171387,
|
| 1224 |
+
"learning_rate": 2.7549212951688598e-05,
|
| 1225 |
+
"loss": 0.9591,
|
| 1226 |
+
"step": 1720
|
| 1227 |
+
},
|
| 1228 |
+
{
|
| 1229 |
+
"epoch": 0.7688888888888888,
|
| 1230 |
+
"grad_norm": 8.489920616149902,
|
| 1231 |
+
"learning_rate": 2.6574999641081812e-05,
|
| 1232 |
+
"loss": 0.9285,
|
| 1233 |
+
"step": 1730
|
| 1234 |
+
},
|
| 1235 |
+
{
|
| 1236 |
+
"epoch": 0.7733333333333333,
|
| 1237 |
+
"grad_norm": 7.725697994232178,
|
| 1238 |
+
"learning_rate": 2.561567738132149e-05,
|
| 1239 |
+
"loss": 0.8912,
|
| 1240 |
+
"step": 1740
|
| 1241 |
+
},
|
| 1242 |
+
{
|
| 1243 |
+
"epoch": 0.7777777777777778,
|
| 1244 |
+
"grad_norm": 8.986964225769043,
|
| 1245 |
+
"learning_rate": 2.467144072898202e-05,
|
| 1246 |
+
"loss": 0.9386,
|
| 1247 |
+
"step": 1750
|
| 1248 |
+
},
|
| 1249 |
+
{
|
| 1250 |
+
"epoch": 0.7822222222222223,
|
| 1251 |
+
"grad_norm": 8.926631927490234,
|
| 1252 |
+
"learning_rate": 2.3742481181182065e-05,
|
| 1253 |
+
"loss": 0.9224,
|
| 1254 |
+
"step": 1760
|
| 1255 |
+
},
|
| 1256 |
+
{
|
| 1257 |
+
"epoch": 0.7866666666666666,
|
| 1258 |
+
"grad_norm": 7.921815395355225,
|
| 1259 |
+
"learning_rate": 2.2828987136747505e-05,
|
| 1260 |
+
"loss": 0.9393,
|
| 1261 |
+
"step": 1770
|
| 1262 |
+
},
|
| 1263 |
+
{
|
| 1264 |
+
"epoch": 0.7911111111111111,
|
| 1265 |
+
"grad_norm": 6.680901050567627,
|
| 1266 |
+
"learning_rate": 2.193114385800309e-05,
|
| 1267 |
+
"loss": 0.9359,
|
| 1268 |
+
"step": 1780
|
| 1269 |
+
},
|
| 1270 |
+
{
|
| 1271 |
+
"epoch": 0.7955555555555556,
|
| 1272 |
+
"grad_norm": 6.957186698913574,
|
| 1273 |
+
"learning_rate": 2.104913343320013e-05,
|
| 1274 |
+
"loss": 0.9285,
|
| 1275 |
+
"step": 1790
|
| 1276 |
+
},
|
| 1277 |
+
{
|
| 1278 |
+
"epoch": 0.8,
|
| 1279 |
+
"grad_norm": 7.6232008934021,
|
| 1280 |
+
"learning_rate": 2.0183134739587807e-05,
|
| 1281 |
+
"loss": 0.9083,
|
| 1282 |
+
"step": 1800
|
| 1283 |
}
|
| 1284 |
],
|
| 1285 |
"logging_steps": 10,
|
|
|
|
| 1299 |
"attributes": {}
|
| 1300 |
}
|
| 1301 |
},
|
| 1302 |
+
"total_flos": 4.9102593196032e+19,
|
| 1303 |
"train_batch_size": 4,
|
| 1304 |
"trial_name": null,
|
| 1305 |
"trial_params": null
|