mohammadmahdinouri commited on
Commit
ce05ea7
·
verified ·
1 Parent(s): c57f650

Training in progress, step 2000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4145daebf514960424426dde3ac18791eecc28556f940d2f1279e7831c514654
3
  size 448472762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8c3c2489afee023fe1b2642034587a0bfba7f9b9702eb8b912fd843cd7d1a84
3
  size 448472762
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b75ae1e75120c33ab2fa9bf933b8449d15710df9b78f94fa73c4c0f186c09c47
3
  size 151589028
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5b2b3370508976fd8007ef90e4249ff2ce2f69eb4456cd0ada442a2a9748885
3
  size 151589028
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c45bff611205a48d357012be58601cad8d52180e6ba8ae7b9b1ca21b9d659d0
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38749652e18bcce614c97ce9bdf3fcf3e27d562abcfb77e8d4ee8fef9ce033f9
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c19fdb0a523198817691be875750d0695dc0006f20c28aee74ce2c6f5e754fdb
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc995806a1d26375df1705636e27a9de242c2427fba3dd43fa585853178b2b24
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:748f82303db435db7a8ad88a622a99fc2b1c74bd84f8ed546b1f4733414b3ff4
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae93497bb2108e5e81c1355029bd994f1afe726f0900074266771d7a223f2a18
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3cd497bb715d7ddc369134434d881bc512ec6192975735d4c15cbdcb223196e4
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36fbc7f6fa9e4cc3d70f89ca68cdad6c0f766ecec8e0660c99c73f616d503ca6
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41059b96bf7552199863826b1441616d21eed16456dd0c2a6456f0486fdeecde
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed4f0c7d27d3a2f37d3ba7b466a05d56a94bc7ee56305a4c3a09de3291b6daed
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.003162538675212549,
6
  "eval_steps": 500,
7
- "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1058,6 +1058,356 @@
1058
  "learning_rate": 0.0004996488344396747,
1059
  "loss": 3.1223,
1060
  "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1061
  }
1062
  ],
1063
  "logging_steps": 10,
@@ -1077,7 +1427,7 @@
1077
  "attributes": {}
1078
  }
1079
  },
1080
- "total_flos": 4.9047061694880154e+17,
1081
  "train_batch_size": 48,
1082
  "trial_name": null,
1083
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.004216718233616732,
6
  "eval_steps": 500,
7
+ "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1058
  "learning_rate": 0.0004996488344396747,
1059
  "loss": 3.1223,
1060
  "step": 1500
1061
+ },
1062
+ {
1063
+ "epoch": 0.0031836222663806325,
1064
+ "grad_norm": 1.0859375,
1065
+ "learning_rate": 0.0004996453192689008,
1066
+ "loss": 3.1059,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 0.0032047058575487165,
1071
+ "grad_norm": 0.8984375,
1072
+ "learning_rate": 0.0004996418040981267,
1073
+ "loss": 3.0938,
1074
+ "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 0.0032257894487168,
1078
+ "grad_norm": 0.96484375,
1079
+ "learning_rate": 0.0004996382889273527,
1080
+ "loss": 3.1251,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 0.0032468730398848835,
1085
+ "grad_norm": 0.88671875,
1086
+ "learning_rate": 0.0004996347737565787,
1087
+ "loss": 3.106,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 0.003267956631052967,
1092
+ "grad_norm": 0.9765625,
1093
+ "learning_rate": 0.0004996312585858046,
1094
+ "loss": 3.089,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 0.003289040222221051,
1099
+ "grad_norm": 0.96875,
1100
+ "learning_rate": 0.0004996277434150306,
1101
+ "loss": 3.09,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 0.0033101238133891346,
1106
+ "grad_norm": 0.98046875,
1107
+ "learning_rate": 0.0004996242282442566,
1108
+ "loss": 3.0891,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 0.003331207404557218,
1113
+ "grad_norm": 0.86328125,
1114
+ "learning_rate": 0.0004996207130734825,
1115
+ "loss": 3.0794,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 0.003352290995725302,
1120
+ "grad_norm": 0.921875,
1121
+ "learning_rate": 0.0004996171979027086,
1122
+ "loss": 3.0718,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 0.0033733745868933856,
1127
+ "grad_norm": 1.0234375,
1128
+ "learning_rate": 0.0004996136827319345,
1129
+ "loss": 3.0706,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 0.003394458178061469,
1134
+ "grad_norm": 1.046875,
1135
+ "learning_rate": 0.0004996101675611604,
1136
+ "loss": 3.053,
1137
+ "step": 1610
1138
+ },
1139
+ {
1140
+ "epoch": 0.0034155417692295527,
1141
+ "grad_norm": 0.88671875,
1142
+ "learning_rate": 0.0004996066523903865,
1143
+ "loss": 3.0476,
1144
+ "step": 1620
1145
+ },
1146
+ {
1147
+ "epoch": 0.0034366253603976367,
1148
+ "grad_norm": 0.8984375,
1149
+ "learning_rate": 0.0004996031372196125,
1150
+ "loss": 3.0537,
1151
+ "step": 1630
1152
+ },
1153
+ {
1154
+ "epoch": 0.00345770895156572,
1155
+ "grad_norm": 0.90625,
1156
+ "learning_rate": 0.0004995996220488384,
1157
+ "loss": 3.0558,
1158
+ "step": 1640
1159
+ },
1160
+ {
1161
+ "epoch": 0.0034787925427338037,
1162
+ "grad_norm": 0.8046875,
1163
+ "learning_rate": 0.0004995961068780644,
1164
+ "loss": 3.0366,
1165
+ "step": 1650
1166
+ },
1167
+ {
1168
+ "epoch": 0.0034998761339018877,
1169
+ "grad_norm": 0.9140625,
1170
+ "learning_rate": 0.0004995925917072904,
1171
+ "loss": 3.0584,
1172
+ "step": 1660
1173
+ },
1174
+ {
1175
+ "epoch": 0.0035209597250699712,
1176
+ "grad_norm": 0.90625,
1177
+ "learning_rate": 0.0004995890765365163,
1178
+ "loss": 3.0447,
1179
+ "step": 1670
1180
+ },
1181
+ {
1182
+ "epoch": 0.0035420433162380548,
1183
+ "grad_norm": 0.8984375,
1184
+ "learning_rate": 0.0004995855613657423,
1185
+ "loss": 3.0423,
1186
+ "step": 1680
1187
+ },
1188
+ {
1189
+ "epoch": 0.0035631269074061383,
1190
+ "grad_norm": 0.91015625,
1191
+ "learning_rate": 0.0004995820461949682,
1192
+ "loss": 3.0278,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 0.0035842104985742223,
1197
+ "grad_norm": 0.8359375,
1198
+ "learning_rate": 0.0004995785310241942,
1199
+ "loss": 3.0404,
1200
+ "step": 1700
1201
+ },
1202
+ {
1203
+ "epoch": 0.003605294089742306,
1204
+ "grad_norm": 0.93359375,
1205
+ "learning_rate": 0.0004995750158534202,
1206
+ "loss": 3.0173,
1207
+ "step": 1710
1208
+ },
1209
+ {
1210
+ "epoch": 0.0036263776809103893,
1211
+ "grad_norm": 0.83203125,
1212
+ "learning_rate": 0.0004995715006826461,
1213
+ "loss": 3.0233,
1214
+ "step": 1720
1215
+ },
1216
+ {
1217
+ "epoch": 0.0036474612720784733,
1218
+ "grad_norm": 0.80078125,
1219
+ "learning_rate": 0.0004995679855118721,
1220
+ "loss": 3.0179,
1221
+ "step": 1730
1222
+ },
1223
+ {
1224
+ "epoch": 0.003668544863246557,
1225
+ "grad_norm": 0.9375,
1226
+ "learning_rate": 0.0004995644703410982,
1227
+ "loss": 3.0283,
1228
+ "step": 1740
1229
+ },
1230
+ {
1231
+ "epoch": 0.0036896284544146404,
1232
+ "grad_norm": 0.8203125,
1233
+ "learning_rate": 0.000499560955170324,
1234
+ "loss": 3.0188,
1235
+ "step": 1750
1236
+ },
1237
+ {
1238
+ "epoch": 0.003710712045582724,
1239
+ "grad_norm": 1.0625,
1240
+ "learning_rate": 0.00049955743999955,
1241
+ "loss": 2.9979,
1242
+ "step": 1760
1243
+ },
1244
+ {
1245
+ "epoch": 0.003731795636750808,
1246
+ "grad_norm": 0.87890625,
1247
+ "learning_rate": 0.0004995539248287761,
1248
+ "loss": 2.9959,
1249
+ "step": 1770
1250
+ },
1251
+ {
1252
+ "epoch": 0.0037528792279188914,
1253
+ "grad_norm": 0.85546875,
1254
+ "learning_rate": 0.000499550409658002,
1255
+ "loss": 2.9987,
1256
+ "step": 1780
1257
+ },
1258
+ {
1259
+ "epoch": 0.003773962819086975,
1260
+ "grad_norm": 0.8828125,
1261
+ "learning_rate": 0.000499546894487228,
1262
+ "loss": 3.0055,
1263
+ "step": 1790
1264
+ },
1265
+ {
1266
+ "epoch": 0.003795046410255059,
1267
+ "grad_norm": 0.7734375,
1268
+ "learning_rate": 0.000499543379316454,
1269
+ "loss": 2.984,
1270
+ "step": 1800
1271
+ },
1272
+ {
1273
+ "epoch": 0.0038161300014231425,
1274
+ "grad_norm": 0.828125,
1275
+ "learning_rate": 0.0004995398641456799,
1276
+ "loss": 3.0139,
1277
+ "step": 1810
1278
+ },
1279
+ {
1280
+ "epoch": 0.003837213592591226,
1281
+ "grad_norm": 0.73046875,
1282
+ "learning_rate": 0.0004995363489749059,
1283
+ "loss": 2.9974,
1284
+ "step": 1820
1285
+ },
1286
+ {
1287
+ "epoch": 0.0038582971837593095,
1288
+ "grad_norm": 0.90234375,
1289
+ "learning_rate": 0.0004995328338041319,
1290
+ "loss": 2.9834,
1291
+ "step": 1830
1292
+ },
1293
+ {
1294
+ "epoch": 0.0038793807749273935,
1295
+ "grad_norm": 0.82421875,
1296
+ "learning_rate": 0.0004995293186333578,
1297
+ "loss": 2.9901,
1298
+ "step": 1840
1299
+ },
1300
+ {
1301
+ "epoch": 0.003900464366095477,
1302
+ "grad_norm": 1.03125,
1303
+ "learning_rate": 0.0004995258034625838,
1304
+ "loss": 2.9712,
1305
+ "step": 1850
1306
+ },
1307
+ {
1308
+ "epoch": 0.003921547957263561,
1309
+ "grad_norm": 0.84375,
1310
+ "learning_rate": 0.0004995222882918098,
1311
+ "loss": 2.9673,
1312
+ "step": 1860
1313
+ },
1314
+ {
1315
+ "epoch": 0.003942631548431644,
1316
+ "grad_norm": 0.9140625,
1317
+ "learning_rate": 0.0004995187731210357,
1318
+ "loss": 2.9839,
1319
+ "step": 1870
1320
+ },
1321
+ {
1322
+ "epoch": 0.003963715139599728,
1323
+ "grad_norm": 0.76171875,
1324
+ "learning_rate": 0.0004995152579502618,
1325
+ "loss": 2.9752,
1326
+ "step": 1880
1327
+ },
1328
+ {
1329
+ "epoch": 0.003984798730767812,
1330
+ "grad_norm": 0.80078125,
1331
+ "learning_rate": 0.0004995117427794878,
1332
+ "loss": 2.9529,
1333
+ "step": 1890
1334
+ },
1335
+ {
1336
+ "epoch": 0.004005882321935895,
1337
+ "grad_norm": 0.8125,
1338
+ "learning_rate": 0.0004995082276087136,
1339
+ "loss": 2.9564,
1340
+ "step": 1900
1341
+ },
1342
+ {
1343
+ "epoch": 0.004026965913103979,
1344
+ "grad_norm": 1.109375,
1345
+ "learning_rate": 0.0004995047124379397,
1346
+ "loss": 2.965,
1347
+ "step": 1910
1348
+ },
1349
+ {
1350
+ "epoch": 0.004048049504272063,
1351
+ "grad_norm": 0.94921875,
1352
+ "learning_rate": 0.0004995011972671657,
1353
+ "loss": 2.9564,
1354
+ "step": 1920
1355
+ },
1356
+ {
1357
+ "epoch": 0.004069133095440146,
1358
+ "grad_norm": 0.78515625,
1359
+ "learning_rate": 0.0004994976820963916,
1360
+ "loss": 2.9654,
1361
+ "step": 1930
1362
+ },
1363
+ {
1364
+ "epoch": 0.00409021668660823,
1365
+ "grad_norm": 0.875,
1366
+ "learning_rate": 0.0004994941669256176,
1367
+ "loss": 2.9514,
1368
+ "step": 1940
1369
+ },
1370
+ {
1371
+ "epoch": 0.004111300277776313,
1372
+ "grad_norm": 0.76953125,
1373
+ "learning_rate": 0.0004994906517548436,
1374
+ "loss": 2.9288,
1375
+ "step": 1950
1376
+ },
1377
+ {
1378
+ "epoch": 0.004132383868944397,
1379
+ "grad_norm": 1.015625,
1380
+ "learning_rate": 0.0004994871365840695,
1381
+ "loss": 2.9418,
1382
+ "step": 1960
1383
+ },
1384
+ {
1385
+ "epoch": 0.004153467460112481,
1386
+ "grad_norm": 0.734375,
1387
+ "learning_rate": 0.0004994836214132955,
1388
+ "loss": 2.9364,
1389
+ "step": 1970
1390
+ },
1391
+ {
1392
+ "epoch": 0.004174551051280564,
1393
+ "grad_norm": 0.78515625,
1394
+ "learning_rate": 0.0004994801062425215,
1395
+ "loss": 2.9262,
1396
+ "step": 1980
1397
+ },
1398
+ {
1399
+ "epoch": 0.004195634642448648,
1400
+ "grad_norm": 0.81640625,
1401
+ "learning_rate": 0.0004994765910717474,
1402
+ "loss": 2.9372,
1403
+ "step": 1990
1404
+ },
1405
+ {
1406
+ "epoch": 0.004216718233616732,
1407
+ "grad_norm": 0.98046875,
1408
+ "learning_rate": 0.0004994730759009734,
1409
+ "loss": 2.9266,
1410
+ "step": 2000
1411
  }
1412
  ],
1413
  "logging_steps": 10,
 
1427
  "attributes": {}
1428
  }
1429
  },
1430
+ "total_flos": 6.539445548728975e+17,
1431
  "train_batch_size": 48,
1432
  "trial_name": null,
1433
  "trial_params": null