Training in progress, step 2000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 328277848
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b30006c3c8ebdd220eda160d67d570192e678e4b938a46729d63d00fc226c89
|
| 3 |
size 328277848
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 318646859
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2d83b910297466c079691649d9d51db171a5eff2b984ed10840ddd4d5cf17b1d
|
| 3 |
size 318646859
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8647979d889bb2b15d0a3e8961a7e547be28d07767d240f858bd959476bb870c
|
| 3 |
size 14645
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4a6e444c46ec49de792e4afbe9af4aa4613bca60425da2b0ac2cae225e516fcc
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -1089,6 +1089,364 @@
|
|
| 1089 |
"eval_samples_per_second": 277.282,
|
| 1090 |
"eval_steps_per_second": 5.823,
|
| 1091 |
"step": 1500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1092 |
}
|
| 1093 |
],
|
| 1094 |
"logging_steps": 10,
|
|
@@ -1108,7 +1466,7 @@
|
|
| 1108 |
"attributes": {}
|
| 1109 |
}
|
| 1110 |
},
|
| 1111 |
-
"total_flos":
|
| 1112 |
"train_batch_size": 48,
|
| 1113 |
"trial_name": null,
|
| 1114 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.33789491468153404,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 2000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 1089 |
"eval_samples_per_second": 277.282,
|
| 1090 |
"eval_steps_per_second": 5.823,
|
| 1091 |
"step": 1500
|
| 1092 |
+
},
|
| 1093 |
+
{
|
| 1094 |
+
"epoch": 0.2551106605845582,
|
| 1095 |
+
"grad_norm": 1.3389363288879395,
|
| 1096 |
+
"learning_rate": 0.00022634999999999997,
|
| 1097 |
+
"loss": 6.027260589599609,
|
| 1098 |
+
"step": 1510
|
| 1099 |
+
},
|
| 1100 |
+
{
|
| 1101 |
+
"epoch": 0.25680013515796585,
|
| 1102 |
+
"grad_norm": 1.2689851522445679,
|
| 1103 |
+
"learning_rate": 0.00022784999999999995,
|
| 1104 |
+
"loss": 6.00293083190918,
|
| 1105 |
+
"step": 1520
|
| 1106 |
+
},
|
| 1107 |
+
{
|
| 1108 |
+
"epoch": 0.2584896097313735,
|
| 1109 |
+
"grad_norm": 1.4860210418701172,
|
| 1110 |
+
"learning_rate": 0.00022934999999999996,
|
| 1111 |
+
"loss": 5.998868942260742,
|
| 1112 |
+
"step": 1530
|
| 1113 |
+
},
|
| 1114 |
+
{
|
| 1115 |
+
"epoch": 0.2601790843047812,
|
| 1116 |
+
"grad_norm": 1.2490425109863281,
|
| 1117 |
+
"learning_rate": 0.00023084999999999997,
|
| 1118 |
+
"loss": 5.984478759765625,
|
| 1119 |
+
"step": 1540
|
| 1120 |
+
},
|
| 1121 |
+
{
|
| 1122 |
+
"epoch": 0.2618685588781889,
|
| 1123 |
+
"grad_norm": 1.5586382150650024,
|
| 1124 |
+
"learning_rate": 0.00023234999999999998,
|
| 1125 |
+
"loss": 5.9672401428222654,
|
| 1126 |
+
"step": 1550
|
| 1127 |
+
},
|
| 1128 |
+
{
|
| 1129 |
+
"epoch": 0.26355803345159656,
|
| 1130 |
+
"grad_norm": 1.3526853322982788,
|
| 1131 |
+
"learning_rate": 0.00023384999999999997,
|
| 1132 |
+
"loss": 5.982438278198242,
|
| 1133 |
+
"step": 1560
|
| 1134 |
+
},
|
| 1135 |
+
{
|
| 1136 |
+
"epoch": 0.26524750802500424,
|
| 1137 |
+
"grad_norm": 1.3406753540039062,
|
| 1138 |
+
"learning_rate": 0.00023534999999999997,
|
| 1139 |
+
"loss": 5.938652801513672,
|
| 1140 |
+
"step": 1570
|
| 1141 |
+
},
|
| 1142 |
+
{
|
| 1143 |
+
"epoch": 0.2669369825984119,
|
| 1144 |
+
"grad_norm": 1.0397038459777832,
|
| 1145 |
+
"learning_rate": 0.00023684999999999998,
|
| 1146 |
+
"loss": 5.920218658447266,
|
| 1147 |
+
"step": 1580
|
| 1148 |
+
},
|
| 1149 |
+
{
|
| 1150 |
+
"epoch": 0.26862645717181954,
|
| 1151 |
+
"grad_norm": 1.7000986337661743,
|
| 1152 |
+
"learning_rate": 0.00023834999999999997,
|
| 1153 |
+
"loss": 5.896316146850586,
|
| 1154 |
+
"step": 1590
|
| 1155 |
+
},
|
| 1156 |
+
{
|
| 1157 |
+
"epoch": 0.2703159317452272,
|
| 1158 |
+
"grad_norm": 1.1729341745376587,
|
| 1159 |
+
"learning_rate": 0.00023984999999999998,
|
| 1160 |
+
"loss": 5.8752281188964846,
|
| 1161 |
+
"step": 1600
|
| 1162 |
+
},
|
| 1163 |
+
{
|
| 1164 |
+
"epoch": 0.2720054063186349,
|
| 1165 |
+
"grad_norm": 1.3115921020507812,
|
| 1166 |
+
"learning_rate": 0.00024134999999999998,
|
| 1167 |
+
"loss": 5.877028274536133,
|
| 1168 |
+
"step": 1610
|
| 1169 |
+
},
|
| 1170 |
+
{
|
| 1171 |
+
"epoch": 0.2736948808920426,
|
| 1172 |
+
"grad_norm": 1.5481823682785034,
|
| 1173 |
+
"learning_rate": 0.00024284999999999997,
|
| 1174 |
+
"loss": 5.863247299194336,
|
| 1175 |
+
"step": 1620
|
| 1176 |
+
},
|
| 1177 |
+
{
|
| 1178 |
+
"epoch": 0.27538435546545026,
|
| 1179 |
+
"grad_norm": 1.4173649549484253,
|
| 1180 |
+
"learning_rate": 0.00024435,
|
| 1181 |
+
"loss": 5.848538970947265,
|
| 1182 |
+
"step": 1630
|
| 1183 |
+
},
|
| 1184 |
+
{
|
| 1185 |
+
"epoch": 0.27707383003885794,
|
| 1186 |
+
"grad_norm": 1.2587963342666626,
|
| 1187 |
+
"learning_rate": 0.00024585,
|
| 1188 |
+
"loss": 5.841713333129883,
|
| 1189 |
+
"step": 1640
|
| 1190 |
+
},
|
| 1191 |
+
{
|
| 1192 |
+
"epoch": 0.27876330461226556,
|
| 1193 |
+
"grad_norm": 1.0922702550888062,
|
| 1194 |
+
"learning_rate": 0.00024734999999999997,
|
| 1195 |
+
"loss": 5.8486980438232425,
|
| 1196 |
+
"step": 1650
|
| 1197 |
+
},
|
| 1198 |
+
{
|
| 1199 |
+
"epoch": 0.28045277918567324,
|
| 1200 |
+
"grad_norm": 1.6068239212036133,
|
| 1201 |
+
"learning_rate": 0.00024885,
|
| 1202 |
+
"loss": 5.819171142578125,
|
| 1203 |
+
"step": 1660
|
| 1204 |
+
},
|
| 1205 |
+
{
|
| 1206 |
+
"epoch": 0.2821422537590809,
|
| 1207 |
+
"grad_norm": 1.5260576009750366,
|
| 1208 |
+
"learning_rate": 0.00025035,
|
| 1209 |
+
"loss": 5.809968566894531,
|
| 1210 |
+
"step": 1670
|
| 1211 |
+
},
|
| 1212 |
+
{
|
| 1213 |
+
"epoch": 0.2838317283324886,
|
| 1214 |
+
"grad_norm": 1.2246356010437012,
|
| 1215 |
+
"learning_rate": 0.00025184999999999997,
|
| 1216 |
+
"loss": 5.788796997070312,
|
| 1217 |
+
"step": 1680
|
| 1218 |
+
},
|
| 1219 |
+
{
|
| 1220 |
+
"epoch": 0.2855212029058963,
|
| 1221 |
+
"grad_norm": 1.0366030931472778,
|
| 1222 |
+
"learning_rate": 0.00025335,
|
| 1223 |
+
"loss": 5.78180160522461,
|
| 1224 |
+
"step": 1690
|
| 1225 |
+
},
|
| 1226 |
+
{
|
| 1227 |
+
"epoch": 0.28721067747930396,
|
| 1228 |
+
"grad_norm": 1.2072358131408691,
|
| 1229 |
+
"learning_rate": 0.00025485,
|
| 1230 |
+
"loss": 5.770789337158203,
|
| 1231 |
+
"step": 1700
|
| 1232 |
+
},
|
| 1233 |
+
{
|
| 1234 |
+
"epoch": 0.28890015205271163,
|
| 1235 |
+
"grad_norm": 1.3359684944152832,
|
| 1236 |
+
"learning_rate": 0.00025634999999999997,
|
| 1237 |
+
"loss": 5.737417221069336,
|
| 1238 |
+
"step": 1710
|
| 1239 |
+
},
|
| 1240 |
+
{
|
| 1241 |
+
"epoch": 0.29058962662611926,
|
| 1242 |
+
"grad_norm": 1.355406403541565,
|
| 1243 |
+
"learning_rate": 0.00025785,
|
| 1244 |
+
"loss": 5.725430297851562,
|
| 1245 |
+
"step": 1720
|
| 1246 |
+
},
|
| 1247 |
+
{
|
| 1248 |
+
"epoch": 0.29227910119952694,
|
| 1249 |
+
"grad_norm": 1.1998307704925537,
|
| 1250 |
+
"learning_rate": 0.00025935,
|
| 1251 |
+
"loss": 5.723165130615234,
|
| 1252 |
+
"step": 1730
|
| 1253 |
+
},
|
| 1254 |
+
{
|
| 1255 |
+
"epoch": 0.2939685757729346,
|
| 1256 |
+
"grad_norm": 1.0525386333465576,
|
| 1257 |
+
"learning_rate": 0.00026084999999999997,
|
| 1258 |
+
"loss": 5.720573043823242,
|
| 1259 |
+
"step": 1740
|
| 1260 |
+
},
|
| 1261 |
+
{
|
| 1262 |
+
"epoch": 0.2956580503463423,
|
| 1263 |
+
"grad_norm": 1.2880501747131348,
|
| 1264 |
+
"learning_rate": 0.00026235,
|
| 1265 |
+
"loss": 5.684521102905274,
|
| 1266 |
+
"step": 1750
|
| 1267 |
+
},
|
| 1268 |
+
{
|
| 1269 |
+
"epoch": 0.29734752491975,
|
| 1270 |
+
"grad_norm": 1.2246838808059692,
|
| 1271 |
+
"learning_rate": 0.00026384999999999994,
|
| 1272 |
+
"loss": 5.670655059814453,
|
| 1273 |
+
"step": 1760
|
| 1274 |
+
},
|
| 1275 |
+
{
|
| 1276 |
+
"epoch": 0.29903699949315765,
|
| 1277 |
+
"grad_norm": 1.2167463302612305,
|
| 1278 |
+
"learning_rate": 0.00026534999999999997,
|
| 1279 |
+
"loss": 5.690992736816407,
|
| 1280 |
+
"step": 1770
|
| 1281 |
+
},
|
| 1282 |
+
{
|
| 1283 |
+
"epoch": 0.3007264740665653,
|
| 1284 |
+
"grad_norm": 1.2467341423034668,
|
| 1285 |
+
"learning_rate": 0.00026684999999999995,
|
| 1286 |
+
"loss": 5.694464492797851,
|
| 1287 |
+
"step": 1780
|
| 1288 |
+
},
|
| 1289 |
+
{
|
| 1290 |
+
"epoch": 0.30241594863997295,
|
| 1291 |
+
"grad_norm": 1.2740100622177124,
|
| 1292 |
+
"learning_rate": 0.00026835,
|
| 1293 |
+
"loss": 5.679082870483398,
|
| 1294 |
+
"step": 1790
|
| 1295 |
+
},
|
| 1296 |
+
{
|
| 1297 |
+
"epoch": 0.30410542321338063,
|
| 1298 |
+
"grad_norm": 1.2217073440551758,
|
| 1299 |
+
"learning_rate": 0.00026984999999999997,
|
| 1300 |
+
"loss": 5.650615692138672,
|
| 1301 |
+
"step": 1800
|
| 1302 |
+
},
|
| 1303 |
+
{
|
| 1304 |
+
"epoch": 0.3057948977867883,
|
| 1305 |
+
"grad_norm": 1.1172698736190796,
|
| 1306 |
+
"learning_rate": 0.00027134999999999995,
|
| 1307 |
+
"loss": 5.651753234863281,
|
| 1308 |
+
"step": 1810
|
| 1309 |
+
},
|
| 1310 |
+
{
|
| 1311 |
+
"epoch": 0.307484372360196,
|
| 1312 |
+
"grad_norm": 1.1706960201263428,
|
| 1313 |
+
"learning_rate": 0.00027285,
|
| 1314 |
+
"loss": 5.6512096405029295,
|
| 1315 |
+
"step": 1820
|
| 1316 |
+
},
|
| 1317 |
+
{
|
| 1318 |
+
"epoch": 0.30917384693360367,
|
| 1319 |
+
"grad_norm": 0.91384357213974,
|
| 1320 |
+
"learning_rate": 0.00027435,
|
| 1321 |
+
"loss": 5.63836784362793,
|
| 1322 |
+
"step": 1830
|
| 1323 |
+
},
|
| 1324 |
+
{
|
| 1325 |
+
"epoch": 0.3108633215070113,
|
| 1326 |
+
"grad_norm": 1.1929048299789429,
|
| 1327 |
+
"learning_rate": 0.00027584999999999996,
|
| 1328 |
+
"loss": 5.628775787353516,
|
| 1329 |
+
"step": 1840
|
| 1330 |
+
},
|
| 1331 |
+
{
|
| 1332 |
+
"epoch": 0.31255279608041897,
|
| 1333 |
+
"grad_norm": 1.023672103881836,
|
| 1334 |
+
"learning_rate": 0.00027735,
|
| 1335 |
+
"loss": 5.616031265258789,
|
| 1336 |
+
"step": 1850
|
| 1337 |
+
},
|
| 1338 |
+
{
|
| 1339 |
+
"epoch": 0.31424227065382665,
|
| 1340 |
+
"grad_norm": 1.1450271606445312,
|
| 1341 |
+
"learning_rate": 0.00027885,
|
| 1342 |
+
"loss": 5.612253952026367,
|
| 1343 |
+
"step": 1860
|
| 1344 |
+
},
|
| 1345 |
+
{
|
| 1346 |
+
"epoch": 0.31593174522723433,
|
| 1347 |
+
"grad_norm": 1.0316193103790283,
|
| 1348 |
+
"learning_rate": 0.00028034999999999996,
|
| 1349 |
+
"loss": 5.577928161621093,
|
| 1350 |
+
"step": 1870
|
| 1351 |
+
},
|
| 1352 |
+
{
|
| 1353 |
+
"epoch": 0.317621219800642,
|
| 1354 |
+
"grad_norm": 1.1516318321228027,
|
| 1355 |
+
"learning_rate": 0.00028185,
|
| 1356 |
+
"loss": 5.589142227172852,
|
| 1357 |
+
"step": 1880
|
| 1358 |
+
},
|
| 1359 |
+
{
|
| 1360 |
+
"epoch": 0.3193106943740497,
|
| 1361 |
+
"grad_norm": 1.426249384880066,
|
| 1362 |
+
"learning_rate": 0.00028335,
|
| 1363 |
+
"loss": 5.594329071044922,
|
| 1364 |
+
"step": 1890
|
| 1365 |
+
},
|
| 1366 |
+
{
|
| 1367 |
+
"epoch": 0.32100016894745736,
|
| 1368 |
+
"grad_norm": 1.0666186809539795,
|
| 1369 |
+
"learning_rate": 0.00028484999999999996,
|
| 1370 |
+
"loss": 5.582658386230468,
|
| 1371 |
+
"step": 1900
|
| 1372 |
+
},
|
| 1373 |
+
{
|
| 1374 |
+
"epoch": 0.322689643520865,
|
| 1375 |
+
"grad_norm": 0.8879145979881287,
|
| 1376 |
+
"learning_rate": 0.00028635,
|
| 1377 |
+
"loss": 5.542075347900391,
|
| 1378 |
+
"step": 1910
|
| 1379 |
+
},
|
| 1380 |
+
{
|
| 1381 |
+
"epoch": 0.32437911809427267,
|
| 1382 |
+
"grad_norm": 1.2985228300094604,
|
| 1383 |
+
"learning_rate": 0.00028785,
|
| 1384 |
+
"loss": 5.572188949584961,
|
| 1385 |
+
"step": 1920
|
| 1386 |
+
},
|
| 1387 |
+
{
|
| 1388 |
+
"epoch": 0.32606859266768035,
|
| 1389 |
+
"grad_norm": 1.1801198720932007,
|
| 1390 |
+
"learning_rate": 0.00028934999999999996,
|
| 1391 |
+
"loss": 5.531465530395508,
|
| 1392 |
+
"step": 1930
|
| 1393 |
+
},
|
| 1394 |
+
{
|
| 1395 |
+
"epoch": 0.327758067241088,
|
| 1396 |
+
"grad_norm": 1.3345341682434082,
|
| 1397 |
+
"learning_rate": 0.00029085,
|
| 1398 |
+
"loss": 5.5121315002441404,
|
| 1399 |
+
"step": 1940
|
| 1400 |
+
},
|
| 1401 |
+
{
|
| 1402 |
+
"epoch": 0.3294475418144957,
|
| 1403 |
+
"grad_norm": 0.9832890629768372,
|
| 1404 |
+
"learning_rate": 0.00029235,
|
| 1405 |
+
"loss": 5.515644073486328,
|
| 1406 |
+
"step": 1950
|
| 1407 |
+
},
|
| 1408 |
+
{
|
| 1409 |
+
"epoch": 0.3311370163879034,
|
| 1410 |
+
"grad_norm": 1.379388689994812,
|
| 1411 |
+
"learning_rate": 0.00029384999999999996,
|
| 1412 |
+
"loss": 5.5223854064941404,
|
| 1413 |
+
"step": 1960
|
| 1414 |
+
},
|
| 1415 |
+
{
|
| 1416 |
+
"epoch": 0.332826490961311,
|
| 1417 |
+
"grad_norm": 1.0441769361495972,
|
| 1418 |
+
"learning_rate": 0.00029535,
|
| 1419 |
+
"loss": 5.502047729492188,
|
| 1420 |
+
"step": 1970
|
| 1421 |
+
},
|
| 1422 |
+
{
|
| 1423 |
+
"epoch": 0.3345159655347187,
|
| 1424 |
+
"grad_norm": 1.0386887788772583,
|
| 1425 |
+
"learning_rate": 0.00029685,
|
| 1426 |
+
"loss": 5.521197128295898,
|
| 1427 |
+
"step": 1980
|
| 1428 |
+
},
|
| 1429 |
+
{
|
| 1430 |
+
"epoch": 0.33620544010812636,
|
| 1431 |
+
"grad_norm": 0.8223176598548889,
|
| 1432 |
+
"learning_rate": 0.00029835,
|
| 1433 |
+
"loss": 5.479276275634765,
|
| 1434 |
+
"step": 1990
|
| 1435 |
+
},
|
| 1436 |
+
{
|
| 1437 |
+
"epoch": 0.33789491468153404,
|
| 1438 |
+
"grad_norm": 1.2531520128250122,
|
| 1439 |
+
"learning_rate": 0.00029985,
|
| 1440 |
+
"loss": 5.487053298950196,
|
| 1441 |
+
"step": 2000
|
| 1442 |
+
},
|
| 1443 |
+
{
|
| 1444 |
+
"epoch": 0.33789491468153404,
|
| 1445 |
+
"eval_loss": 5.460203170776367,
|
| 1446 |
+
"eval_runtime": 3.9099,
|
| 1447 |
+
"eval_samples_per_second": 255.761,
|
| 1448 |
+
"eval_steps_per_second": 5.371,
|
| 1449 |
+
"step": 2000
|
| 1450 |
}
|
| 1451 |
],
|
| 1452 |
"logging_steps": 10,
|
|
|
|
| 1466 |
"attributes": {}
|
| 1467 |
}
|
| 1468 |
},
|
| 1469 |
+
"total_flos": 6.6891364171776e+16,
|
| 1470 |
"train_batch_size": 48,
|
| 1471 |
"trial_name": null,
|
| 1472 |
"trial_params": null
|