8BitStudio commited on
Commit
ef8da5e
·
verified ·
1 Parent(s): 612da55

Training in progress, step 10000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa2fa49e5ab01e8388f884f001a6fef59415f0afcdf8851cf32b99cba1b66f98
3
  size 1520630616
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e74cac81df1d9f55b850794a03cd64fce4492c0c0da5d81e9909dae9911f943
3
  size 1520630616
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c7bd1f1004e066807e00b62878ad4b49df433de186c64f0a97f9237a03eb281b
3
  size 3041448587
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bf3be67603d9aa1f5d666b6a508c045b0cbd46af1138c22216863f18d284cfb
3
  size 3041448587
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2ea0240538fb238def027691182a688f4848085d98c59d8205c56a6ab84887c
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:592f06f7337b836b66cd80a06e6dc9e25ae533b97c6347eb9344f6ecddefa9aa
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2054ec2901370b6a537467b9fa82f13f962dc91e80e60e56cd6658a9567a46a8
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a081bc5da5ed0dc09d1d00741d1fe6bdeae12f8d58e5b4d44a7d78e0ad120f04
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.0121748633879784,
6
  "eval_steps": 500,
7
- "global_step": 8000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1128,6 +1128,286 @@
1128
  "learning_rate": 0.00029708649617388356,
1129
  "loss": 2.0629,
1130
  "step": 8000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1131
  }
1132
  ],
1133
  "logging_steps": 50,
@@ -1147,7 +1427,7 @@
1147
  "attributes": {}
1148
  }
1149
  },
1150
- "total_flos": 4.278236395534811e+18,
1151
  "train_batch_size": 16,
1152
  "trial_name": null,
1153
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.0340327868852457,
6
  "eval_steps": 500,
7
+ "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1128
  "learning_rate": 0.00029708649617388356,
1129
  "loss": 2.0629,
1130
  "step": 8000
1131
+ },
1132
+ {
1133
+ "epoch": 2.01272131147541,
1134
+ "grad_norm": 0.515625,
1135
+ "learning_rate": 0.0002970344005912617,
1136
+ "loss": 2.0588,
1137
+ "step": 8050
1138
+ },
1139
+ {
1140
+ "epoch": 2.0132677595628414,
1141
+ "grad_norm": 0.4765625,
1142
+ "learning_rate": 0.000296981848022331,
1143
+ "loss": 2.0373,
1144
+ "step": 8100
1145
+ },
1146
+ {
1147
+ "epoch": 2.0138142076502734,
1148
+ "grad_norm": 0.44921875,
1149
+ "learning_rate": 0.000296928838630426,
1150
+ "loss": 2.0348,
1151
+ "step": 8150
1152
+ },
1153
+ {
1154
+ "epoch": 2.014360655737705,
1155
+ "grad_norm": 0.490234375,
1156
+ "learning_rate": 0.0002968753725803013,
1157
+ "loss": 2.0784,
1158
+ "step": 8200
1159
+ },
1160
+ {
1161
+ "epoch": 2.0149071038251365,
1162
+ "grad_norm": 0.5390625,
1163
+ "learning_rate": 0.0002968214500381304,
1164
+ "loss": 2.0531,
1165
+ "step": 8250
1166
+ },
1167
+ {
1168
+ "epoch": 2.0154535519125685,
1169
+ "grad_norm": 0.546875,
1170
+ "learning_rate": 0.000296767071171506,
1171
+ "loss": 2.0482,
1172
+ "step": 8300
1173
+ },
1174
+ {
1175
+ "epoch": 2.016,
1176
+ "grad_norm": 0.484375,
1177
+ "learning_rate": 0.00029671223614943874,
1178
+ "loss": 2.0193,
1179
+ "step": 8350
1180
+ },
1181
+ {
1182
+ "epoch": 2.0165464480874316,
1183
+ "grad_norm": 0.4765625,
1184
+ "learning_rate": 0.0002966569451423572,
1185
+ "loss": 2.007,
1186
+ "step": 8400
1187
+ },
1188
+ {
1189
+ "epoch": 2.0170928961748635,
1190
+ "grad_norm": 0.48828125,
1191
+ "learning_rate": 0.000296601198322107,
1192
+ "loss": 2.0325,
1193
+ "step": 8450
1194
+ },
1195
+ {
1196
+ "epoch": 2.017639344262295,
1197
+ "grad_norm": 0.4921875,
1198
+ "learning_rate": 0.0002965449958619508,
1199
+ "loss": 2.0173,
1200
+ "step": 8500
1201
+ },
1202
+ {
1203
+ "epoch": 2.0181857923497266,
1204
+ "grad_norm": 0.51953125,
1205
+ "learning_rate": 0.0002964883379365668,
1206
+ "loss": 1.9927,
1207
+ "step": 8550
1208
+ },
1209
+ {
1210
+ "epoch": 2.0187322404371586,
1211
+ "grad_norm": 0.546875,
1212
+ "learning_rate": 0.00029643122472204934,
1213
+ "loss": 2.0149,
1214
+ "step": 8600
1215
+ },
1216
+ {
1217
+ "epoch": 2.01927868852459,
1218
+ "grad_norm": 0.49609375,
1219
+ "learning_rate": 0.00029637365639590763,
1220
+ "loss": 2.0077,
1221
+ "step": 8650
1222
+ },
1223
+ {
1224
+ "epoch": 2.0198251366120217,
1225
+ "grad_norm": 0.5390625,
1226
+ "learning_rate": 0.00029631563313706525,
1227
+ "loss": 1.9926,
1228
+ "step": 8700
1229
+ },
1230
+ {
1231
+ "epoch": 2.0203715846994537,
1232
+ "grad_norm": 0.494140625,
1233
+ "learning_rate": 0.0002962571551258599,
1234
+ "loss": 2.0248,
1235
+ "step": 8750
1236
+ },
1237
+ {
1238
+ "epoch": 2.020918032786885,
1239
+ "grad_norm": 0.51953125,
1240
+ "learning_rate": 0.00029619822254404256,
1241
+ "loss": 1.998,
1242
+ "step": 8800
1243
+ },
1244
+ {
1245
+ "epoch": 2.0214644808743167,
1246
+ "grad_norm": 0.478515625,
1247
+ "learning_rate": 0.00029613883557477706,
1248
+ "loss": 1.9957,
1249
+ "step": 8850
1250
+ },
1251
+ {
1252
+ "epoch": 2.0220109289617487,
1253
+ "grad_norm": 0.49609375,
1254
+ "learning_rate": 0.00029607899440263946,
1255
+ "loss": 2.0132,
1256
+ "step": 8900
1257
+ },
1258
+ {
1259
+ "epoch": 2.0225573770491803,
1260
+ "grad_norm": 0.5390625,
1261
+ "learning_rate": 0.00029601869921361756,
1262
+ "loss": 2.0038,
1263
+ "step": 8950
1264
+ },
1265
+ {
1266
+ "epoch": 2.0231038251366122,
1267
+ "grad_norm": 0.458984375,
1268
+ "learning_rate": 0.00029595795019511005,
1269
+ "loss": 1.9447,
1270
+ "step": 9000
1271
+ },
1272
+ {
1273
+ "epoch": 2.023650273224044,
1274
+ "grad_norm": 0.5078125,
1275
+ "learning_rate": 0.00029589674753592647,
1276
+ "loss": 1.9806,
1277
+ "step": 9050
1278
+ },
1279
+ {
1280
+ "epoch": 2.0241967213114753,
1281
+ "grad_norm": 0.5546875,
1282
+ "learning_rate": 0.000295835091426286,
1283
+ "loss": 1.9738,
1284
+ "step": 9100
1285
+ },
1286
+ {
1287
+ "epoch": 2.0247431693989073,
1288
+ "grad_norm": 0.5078125,
1289
+ "learning_rate": 0.00029577298205781726,
1290
+ "loss": 1.9568,
1291
+ "step": 9150
1292
+ },
1293
+ {
1294
+ "epoch": 2.025289617486339,
1295
+ "grad_norm": 0.5078125,
1296
+ "learning_rate": 0.00029571041962355755,
1297
+ "loss": 1.9778,
1298
+ "step": 9200
1299
+ },
1300
+ {
1301
+ "epoch": 2.0258360655737704,
1302
+ "grad_norm": 0.5078125,
1303
+ "learning_rate": 0.0002956474043179525,
1304
+ "loss": 2.0212,
1305
+ "step": 9250
1306
+ },
1307
+ {
1308
+ "epoch": 2.0263825136612024,
1309
+ "grad_norm": 0.51953125,
1310
+ "learning_rate": 0.0002955839363368549,
1311
+ "loss": 1.9981,
1312
+ "step": 9300
1313
+ },
1314
+ {
1315
+ "epoch": 2.026928961748634,
1316
+ "grad_norm": 0.5,
1317
+ "learning_rate": 0.00029552001587752495,
1318
+ "loss": 1.9763,
1319
+ "step": 9350
1320
+ },
1321
+ {
1322
+ "epoch": 2.0274754098360654,
1323
+ "grad_norm": 0.515625,
1324
+ "learning_rate": 0.0002954556431386288,
1325
+ "loss": 1.9687,
1326
+ "step": 9400
1327
+ },
1328
+ {
1329
+ "epoch": 2.0280218579234974,
1330
+ "grad_norm": 0.5390625,
1331
+ "learning_rate": 0.00029539081832023837,
1332
+ "loss": 1.9391,
1333
+ "step": 9450
1334
+ },
1335
+ {
1336
+ "epoch": 2.028568306010929,
1337
+ "grad_norm": 0.56640625,
1338
+ "learning_rate": 0.0002953255416238308,
1339
+ "loss": 1.9614,
1340
+ "step": 9500
1341
+ },
1342
+ {
1343
+ "epoch": 2.0291147540983605,
1344
+ "grad_norm": 0.490234375,
1345
+ "learning_rate": 0.0002952598132522874,
1346
+ "loss": 1.9405,
1347
+ "step": 9550
1348
+ },
1349
+ {
1350
+ "epoch": 2.0296612021857925,
1351
+ "grad_norm": 0.546875,
1352
+ "learning_rate": 0.00029519363340989367,
1353
+ "loss": 1.9653,
1354
+ "step": 9600
1355
+ },
1356
+ {
1357
+ "epoch": 2.030207650273224,
1358
+ "grad_norm": 0.50390625,
1359
+ "learning_rate": 0.0002951270023023379,
1360
+ "loss": 1.9704,
1361
+ "step": 9650
1362
+ },
1363
+ {
1364
+ "epoch": 2.0307540983606556,
1365
+ "grad_norm": 0.53515625,
1366
+ "learning_rate": 0.00029505992013671126,
1367
+ "loss": 1.9592,
1368
+ "step": 9700
1369
+ },
1370
+ {
1371
+ "epoch": 2.0313005464480876,
1372
+ "grad_norm": 0.474609375,
1373
+ "learning_rate": 0.0002949923871215065,
1374
+ "loss": 1.9679,
1375
+ "step": 9750
1376
+ },
1377
+ {
1378
+ "epoch": 2.031846994535519,
1379
+ "grad_norm": 0.5546875,
1380
+ "learning_rate": 0.000294924403466618,
1381
+ "loss": 1.9398,
1382
+ "step": 9800
1383
+ },
1384
+ {
1385
+ "epoch": 2.0323934426229506,
1386
+ "grad_norm": 0.5625,
1387
+ "learning_rate": 0.00029485596938334037,
1388
+ "loss": 1.9469,
1389
+ "step": 9850
1390
+ },
1391
+ {
1392
+ "epoch": 2.0329398907103826,
1393
+ "grad_norm": 0.51953125,
1394
+ "learning_rate": 0.00029478708508436834,
1395
+ "loss": 1.9742,
1396
+ "step": 9900
1397
+ },
1398
+ {
1399
+ "epoch": 2.033486338797814,
1400
+ "grad_norm": 0.482421875,
1401
+ "learning_rate": 0.000294717750783796,
1402
+ "loss": 1.9619,
1403
+ "step": 9950
1404
+ },
1405
+ {
1406
+ "epoch": 2.0340327868852457,
1407
+ "grad_norm": 0.6015625,
1408
+ "learning_rate": 0.0002946479666971158,
1409
+ "loss": 1.8817,
1410
+ "step": 10000
1411
  }
1412
  ],
1413
  "logging_steps": 50,
 
1427
  "attributes": {}
1428
  }
1429
  },
1430
+ "total_flos": 5.347720296331739e+18,
1431
  "train_batch_size": 16,
1432
  "trial_name": null,
1433
  "trial_params": null