ljcamargo commited on
Commit
bd5252d
·
verified ·
1 Parent(s): 0ca8d10

Training in progress, step 2000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d74e94ad70bf2300a6f5f7498cc8db84819565ea3b55fa897d35134d2ab4382f
3
  size 1917255968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92ecbb21d1e0fe04a76374b42b85859839cd5847c3b922def8d9c835efea99e0
3
  size 1917255968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3207fa8fc27939a3a2430169ac40a34063364307b65159c50517dd3c9d0903c8
3
  size 2479129381
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb40c6114ec922d7714527c3e12b5ccaf476fde4ca857ba014ddc6cfb4ede0c4
3
  size 2479129381
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1d565802a8e26c4e8a31328752b7a7fdc186d9401aa008e65697d0ad8c22e33
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c800b778fa7e115e4c34de8529902de8b61c9a1b4bab3eb8295d06dafff030e
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:169caddb558d3e6f8e851fcfc2118f9ce7d97665c7c298e0fc9db7186bcdd5a6
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:946649cc0ec301f9f67d287ff0bc2472a821330c5cd88309b298943469bb0e90
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.9659502535619415,
6
  "eval_steps": 500,
7
- "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1065,6 +1065,356 @@
1065
  "learning_rate": 2.8464381903616438e-05,
1066
  "loss": 0.3255,
1067
  "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1068
  }
1069
  ],
1070
  "logging_steps": 10,
@@ -1084,7 +1434,7 @@
1084
  "attributes": {}
1085
  }
1086
  },
1087
- "total_flos": 1.171541987873833e+17,
1088
  "train_batch_size": 4,
1089
  "trial_name": null,
1090
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.2878531755614586,
6
  "eval_steps": 500,
7
+ "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1065
  "learning_rate": 2.8464381903616438e-05,
1066
  "loss": 0.3255,
1067
  "step": 1500
1068
+ },
1069
+ {
1070
+ "epoch": 0.9723899219190212,
1071
+ "grad_norm": 10.125,
1072
+ "learning_rate": 2.8200432913515235e-05,
1073
+ "loss": 0.3507,
1074
+ "step": 1510
1075
+ },
1076
+ {
1077
+ "epoch": 0.9788295902761008,
1078
+ "grad_norm": 17.0,
1079
+ "learning_rate": 2.7936120715902957e-05,
1080
+ "loss": 0.4079,
1081
+ "step": 1520
1082
+ },
1083
+ {
1084
+ "epoch": 0.9852692586331804,
1085
+ "grad_norm": 11.6875,
1086
+ "learning_rate": 2.7671475306776396e-05,
1087
+ "loss": 0.4972,
1088
+ "step": 1530
1089
+ },
1090
+ {
1091
+ "epoch": 0.99170892699026,
1092
+ "grad_norm": 10.4375,
1093
+ "learning_rate": 2.740652671994754e-05,
1094
+ "loss": 0.3757,
1095
+ "step": 1540
1096
+ },
1097
+ {
1098
+ "epoch": 0.9981485953473396,
1099
+ "grad_norm": 8.0625,
1100
+ "learning_rate": 2.7141305023635083e-05,
1101
+ "loss": 0.3437,
1102
+ "step": 1550
1103
+ },
1104
+ {
1105
+ "epoch": 1.0045077678499557,
1106
+ "grad_norm": 11.25,
1107
+ "learning_rate": 2.687584031705211e-05,
1108
+ "loss": 0.2757,
1109
+ "step": 1560
1110
+ },
1111
+ {
1112
+ "epoch": 1.0109474362070354,
1113
+ "grad_norm": 7.03125,
1114
+ "learning_rate": 2.6610162726990185e-05,
1115
+ "loss": 0.2032,
1116
+ "step": 1570
1117
+ },
1118
+ {
1119
+ "epoch": 1.017387104564115,
1120
+ "grad_norm": 10.4375,
1121
+ "learning_rate": 2.6344302404400417e-05,
1122
+ "loss": 0.2548,
1123
+ "step": 1580
1124
+ },
1125
+ {
1126
+ "epoch": 1.0238267729211945,
1127
+ "grad_norm": 8.75,
1128
+ "learning_rate": 2.607828952097165e-05,
1129
+ "loss": 0.2578,
1130
+ "step": 1590
1131
+ },
1132
+ {
1133
+ "epoch": 1.0302664412782743,
1134
+ "grad_norm": 10.75,
1135
+ "learning_rate": 2.5812154265706413e-05,
1136
+ "loss": 0.2078,
1137
+ "step": 1600
1138
+ },
1139
+ {
1140
+ "epoch": 1.0367061096353538,
1141
+ "grad_norm": 11.375,
1142
+ "learning_rate": 2.5545926841494826e-05,
1143
+ "loss": 0.1952,
1144
+ "step": 1610
1145
+ },
1146
+ {
1147
+ "epoch": 1.0431457779924334,
1148
+ "grad_norm": 7.6875,
1149
+ "learning_rate": 2.5279637461687026e-05,
1150
+ "loss": 0.1958,
1151
+ "step": 1620
1152
+ },
1153
+ {
1154
+ "epoch": 1.049585446349513,
1155
+ "grad_norm": 9.9375,
1156
+ "learning_rate": 2.5013316346664273e-05,
1157
+ "loss": 0.2133,
1158
+ "step": 1630
1159
+ },
1160
+ {
1161
+ "epoch": 1.0560251147065927,
1162
+ "grad_norm": 8.75,
1163
+ "learning_rate": 2.474699372040936e-05,
1164
+ "loss": 0.2355,
1165
+ "step": 1640
1166
+ },
1167
+ {
1168
+ "epoch": 1.0624647830636722,
1169
+ "grad_norm": 9.9375,
1170
+ "learning_rate": 2.4480699807076592e-05,
1171
+ "loss": 0.225,
1172
+ "step": 1650
1173
+ },
1174
+ {
1175
+ "epoch": 1.0689044514207517,
1176
+ "grad_norm": 9.125,
1177
+ "learning_rate": 2.4214464827561752e-05,
1178
+ "loss": 0.272,
1179
+ "step": 1660
1180
+ },
1181
+ {
1182
+ "epoch": 1.0753441197778315,
1183
+ "grad_norm": 12.4375,
1184
+ "learning_rate": 2.3948318996072363e-05,
1185
+ "loss": 0.2001,
1186
+ "step": 1670
1187
+ },
1188
+ {
1189
+ "epoch": 1.081783788134911,
1190
+ "grad_norm": 7.375,
1191
+ "learning_rate": 2.3682292516698832e-05,
1192
+ "loss": 0.183,
1193
+ "step": 1680
1194
+ },
1195
+ {
1196
+ "epoch": 1.0882234564919906,
1197
+ "grad_norm": 7.71875,
1198
+ "learning_rate": 2.3416415579986643e-05,
1199
+ "loss": 0.1954,
1200
+ "step": 1690
1201
+ },
1202
+ {
1203
+ "epoch": 1.0946631248490704,
1204
+ "grad_norm": 6.625,
1205
+ "learning_rate": 2.3150718359510142e-05,
1206
+ "loss": 0.1848,
1207
+ "step": 1700
1208
+ },
1209
+ {
1210
+ "epoch": 1.10110279320615,
1211
+ "grad_norm": 8.5,
1212
+ "learning_rate": 2.2885231008448216e-05,
1213
+ "loss": 0.1769,
1214
+ "step": 1710
1215
+ },
1216
+ {
1217
+ "epoch": 1.1075424615632294,
1218
+ "grad_norm": 17.875,
1219
+ "learning_rate": 2.261998365616228e-05,
1220
+ "loss": 0.2485,
1221
+ "step": 1720
1222
+ },
1223
+ {
1224
+ "epoch": 1.1139821299203092,
1225
+ "grad_norm": 9.9375,
1226
+ "learning_rate": 2.2355006404777057e-05,
1227
+ "loss": 0.1795,
1228
+ "step": 1730
1229
+ },
1230
+ {
1231
+ "epoch": 1.1204217982773887,
1232
+ "grad_norm": 8.375,
1233
+ "learning_rate": 2.2090329325764294e-05,
1234
+ "loss": 0.2424,
1235
+ "step": 1740
1236
+ },
1237
+ {
1238
+ "epoch": 1.1268614666344683,
1239
+ "grad_norm": 8.5,
1240
+ "learning_rate": 2.1825982456530086e-05,
1241
+ "loss": 0.203,
1242
+ "step": 1750
1243
+ },
1244
+ {
1245
+ "epoch": 1.133301134991548,
1246
+ "grad_norm": 6.0,
1247
+ "learning_rate": 2.1561995797006017e-05,
1248
+ "loss": 0.2003,
1249
+ "step": 1760
1250
+ },
1251
+ {
1252
+ "epoch": 1.1397408033486276,
1253
+ "grad_norm": 18.5,
1254
+ "learning_rate": 2.129839930624454e-05,
1255
+ "loss": 0.1599,
1256
+ "step": 1770
1257
+ },
1258
+ {
1259
+ "epoch": 1.1461804717057071,
1260
+ "grad_norm": 11.375,
1261
+ "learning_rate": 2.1035222899019057e-05,
1262
+ "loss": 0.2198,
1263
+ "step": 1780
1264
+ },
1265
+ {
1266
+ "epoch": 1.1526201400627867,
1267
+ "grad_norm": 6.65625,
1268
+ "learning_rate": 2.0772496442428914e-05,
1269
+ "loss": 0.1721,
1270
+ "step": 1790
1271
+ },
1272
+ {
1273
+ "epoch": 1.1590598084198664,
1274
+ "grad_norm": 6.625,
1275
+ "learning_rate": 2.051024975250996e-05,
1276
+ "loss": 0.1557,
1277
+ "step": 1800
1278
+ },
1279
+ {
1280
+ "epoch": 1.165499476776946,
1281
+ "grad_norm": 7.96875,
1282
+ "learning_rate": 2.0248512590850716e-05,
1283
+ "loss": 0.205,
1284
+ "step": 1810
1285
+ },
1286
+ {
1287
+ "epoch": 1.1719391451340255,
1288
+ "grad_norm": 5.53125,
1289
+ "learning_rate": 1.9987314661214904e-05,
1290
+ "loss": 0.203,
1291
+ "step": 1820
1292
+ },
1293
+ {
1294
+ "epoch": 1.1783788134911053,
1295
+ "grad_norm": 20.5,
1296
+ "learning_rate": 1.9726685606170415e-05,
1297
+ "loss": 0.2571,
1298
+ "step": 1830
1299
+ },
1300
+ {
1301
+ "epoch": 1.1848184818481848,
1302
+ "grad_norm": 8.25,
1303
+ "learning_rate": 1.9466655003725273e-05,
1304
+ "loss": 0.2108,
1305
+ "step": 1840
1306
+ },
1307
+ {
1308
+ "epoch": 1.1912581502052644,
1309
+ "grad_norm": 10.375,
1310
+ "learning_rate": 1.920725236397091e-05,
1311
+ "loss": 0.2667,
1312
+ "step": 1850
1313
+ },
1314
+ {
1315
+ "epoch": 1.197697818562344,
1316
+ "grad_norm": 5.71875,
1317
+ "learning_rate": 1.8948507125733177e-05,
1318
+ "loss": 0.1615,
1319
+ "step": 1860
1320
+ },
1321
+ {
1322
+ "epoch": 1.2041374869194237,
1323
+ "grad_norm": 5.5625,
1324
+ "learning_rate": 1.869044865323138e-05,
1325
+ "loss": 0.2647,
1326
+ "step": 1870
1327
+ },
1328
+ {
1329
+ "epoch": 1.2105771552765032,
1330
+ "grad_norm": 5.90625,
1331
+ "learning_rate": 1.843310623274587e-05,
1332
+ "loss": 0.1771,
1333
+ "step": 1880
1334
+ },
1335
+ {
1336
+ "epoch": 1.2170168236335828,
1337
+ "grad_norm": 5.28125,
1338
+ "learning_rate": 1.8176509069294396e-05,
1339
+ "loss": 0.1781,
1340
+ "step": 1890
1341
+ },
1342
+ {
1343
+ "epoch": 1.2234564919906625,
1344
+ "grad_norm": 6.78125,
1345
+ "learning_rate": 1.7920686283317712e-05,
1346
+ "loss": 0.167,
1347
+ "step": 1900
1348
+ },
1349
+ {
1350
+ "epoch": 1.229896160347742,
1351
+ "grad_norm": 8.0625,
1352
+ "learning_rate": 1.7665666907374822e-05,
1353
+ "loss": 0.1785,
1354
+ "step": 1910
1355
+ },
1356
+ {
1357
+ "epoch": 1.2363358287048216,
1358
+ "grad_norm": 6.71875,
1359
+ "learning_rate": 1.741147988284813e-05,
1360
+ "loss": 0.1816,
1361
+ "step": 1920
1362
+ },
1363
+ {
1364
+ "epoch": 1.2427754970619014,
1365
+ "grad_norm": 11.25,
1366
+ "learning_rate": 1.715815405665897e-05,
1367
+ "loss": 0.1934,
1368
+ "step": 1930
1369
+ },
1370
+ {
1371
+ "epoch": 1.249215165418981,
1372
+ "grad_norm": 8.125,
1373
+ "learning_rate": 1.6905718177993897e-05,
1374
+ "loss": 0.1683,
1375
+ "step": 1940
1376
+ },
1377
+ {
1378
+ "epoch": 1.2556548337760605,
1379
+ "grad_norm": 8.4375,
1380
+ "learning_rate": 1.6654200895041962e-05,
1381
+ "loss": 0.1812,
1382
+ "step": 1950
1383
+ },
1384
+ {
1385
+ "epoch": 1.2620945021331402,
1386
+ "grad_norm": 7.25,
1387
+ "learning_rate": 1.6403630751743576e-05,
1388
+ "loss": 0.1787,
1389
+ "step": 1960
1390
+ },
1391
+ {
1392
+ "epoch": 1.2685341704902198,
1393
+ "grad_norm": 6.0625,
1394
+ "learning_rate": 1.6154036184551098e-05,
1395
+ "loss": 0.1483,
1396
+ "step": 1970
1397
+ },
1398
+ {
1399
+ "epoch": 1.2749738388472993,
1400
+ "grad_norm": 9.0,
1401
+ "learning_rate": 1.5905445519201694e-05,
1402
+ "loss": 0.3355,
1403
+ "step": 1980
1404
+ },
1405
+ {
1406
+ "epoch": 1.281413507204379,
1407
+ "grad_norm": 9.0,
1408
+ "learning_rate": 1.5657886967502733e-05,
1409
+ "loss": 0.3235,
1410
+ "step": 1990
1411
+ },
1412
+ {
1413
+ "epoch": 1.2878531755614586,
1414
+ "grad_norm": 4.15625,
1415
+ "learning_rate": 1.541138862413009e-05,
1416
+ "loss": 0.1555,
1417
+ "step": 2000
1418
  }
1419
  ],
1420
  "logging_steps": 10,
 
1434
  "attributes": {}
1435
  }
1436
  },
1437
+ "total_flos": 1.5616435963670323e+17,
1438
  "train_batch_size": 4,
1439
  "trial_name": null,
1440
  "trial_params": null