ljcamargo commited on
Commit
96d44d1
·
verified ·
1 Parent(s): d365b00

Training in progress, step 2000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9b54ec1baff0ba606ce32e17ddc0f19455abd8192d57e00b521b66696511516
3
  size 3809184360
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:114b891a762a51a9adf99795ed8a1397abbd097711bfd2ff3927ad599e912fbe
3
  size 3809184360
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ddc1f0349be2c177d29c5468bf9f41d63cd942def1ae5e394f1fa17fe03d12a0
3
  size 2458291491
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0467314eaf329ce64fa294c5b636d1d0a5db236ce1684099429a56bad1f1c530
3
  size 2458291491
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74282f459a34ecd6a82ba47c711b2e80de77520f433252a7093c4336e7b2ec86
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0367ed1b35f65855ad993f74c56f185b353ad034ccb1dbb7df8ac313fc044216
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0269de58bb5fc10e493211ef487ccaef92d922164d165ce82302b7048ea90408
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5daf118c104c253ac47840aed00a104c21470bc6d0bd2a07133bec544d92037c
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.7,
6
  "eval_steps": 500,
7
- "global_step": 1750,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1233,6 +1233,181 @@
1233
  "learning_rate": 1.5221774193548388e-05,
1234
  "loss": 0.6212,
1235
  "step": 1750
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1236
  }
1237
  ],
1238
  "logging_steps": 10,
@@ -1252,7 +1427,7 @@
1252
  "attributes": {}
1253
  }
1254
  },
1255
- "total_flos": 3.15998892642816e+16,
1256
  "train_batch_size": 2,
1257
  "trial_name": null,
1258
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.8,
6
  "eval_steps": 500,
7
+ "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1233
  "learning_rate": 1.5221774193548388e-05,
1234
  "loss": 0.6212,
1235
  "step": 1750
1236
+ },
1237
+ {
1238
+ "epoch": 0.704,
1239
+ "grad_norm": 6.325471878051758,
1240
+ "learning_rate": 1.5020161290322581e-05,
1241
+ "loss": 0.3136,
1242
+ "step": 1760
1243
+ },
1244
+ {
1245
+ "epoch": 0.708,
1246
+ "grad_norm": 10.329962730407715,
1247
+ "learning_rate": 1.4818548387096776e-05,
1248
+ "loss": 0.4331,
1249
+ "step": 1770
1250
+ },
1251
+ {
1252
+ "epoch": 0.712,
1253
+ "grad_norm": 6.0466227531433105,
1254
+ "learning_rate": 1.4616935483870969e-05,
1255
+ "loss": 0.3234,
1256
+ "step": 1780
1257
+ },
1258
+ {
1259
+ "epoch": 0.716,
1260
+ "grad_norm": 11.955286979675293,
1261
+ "learning_rate": 1.4415322580645164e-05,
1262
+ "loss": 0.3829,
1263
+ "step": 1790
1264
+ },
1265
+ {
1266
+ "epoch": 0.72,
1267
+ "grad_norm": 17.199453353881836,
1268
+ "learning_rate": 1.4213709677419357e-05,
1269
+ "loss": 0.5838,
1270
+ "step": 1800
1271
+ },
1272
+ {
1273
+ "epoch": 0.724,
1274
+ "grad_norm": 7.430696964263916,
1275
+ "learning_rate": 1.4012096774193548e-05,
1276
+ "loss": 0.3352,
1277
+ "step": 1810
1278
+ },
1279
+ {
1280
+ "epoch": 0.728,
1281
+ "grad_norm": 18.183691024780273,
1282
+ "learning_rate": 1.3810483870967741e-05,
1283
+ "loss": 0.4604,
1284
+ "step": 1820
1285
+ },
1286
+ {
1287
+ "epoch": 0.732,
1288
+ "grad_norm": 10.275338172912598,
1289
+ "learning_rate": 1.3608870967741934e-05,
1290
+ "loss": 0.3847,
1291
+ "step": 1830
1292
+ },
1293
+ {
1294
+ "epoch": 0.736,
1295
+ "grad_norm": 11.798257827758789,
1296
+ "learning_rate": 1.340725806451613e-05,
1297
+ "loss": 0.4413,
1298
+ "step": 1840
1299
+ },
1300
+ {
1301
+ "epoch": 0.74,
1302
+ "grad_norm": 9.844663619995117,
1303
+ "learning_rate": 1.3205645161290322e-05,
1304
+ "loss": 0.4583,
1305
+ "step": 1850
1306
+ },
1307
+ {
1308
+ "epoch": 0.744,
1309
+ "grad_norm": 11.413599967956543,
1310
+ "learning_rate": 1.3004032258064517e-05,
1311
+ "loss": 0.311,
1312
+ "step": 1860
1313
+ },
1314
+ {
1315
+ "epoch": 0.748,
1316
+ "grad_norm": 14.305254936218262,
1317
+ "learning_rate": 1.280241935483871e-05,
1318
+ "loss": 0.3866,
1319
+ "step": 1870
1320
+ },
1321
+ {
1322
+ "epoch": 0.752,
1323
+ "grad_norm": 8.5895357131958,
1324
+ "learning_rate": 1.2600806451612903e-05,
1325
+ "loss": 0.4094,
1326
+ "step": 1880
1327
+ },
1328
+ {
1329
+ "epoch": 0.756,
1330
+ "grad_norm": 14.033561706542969,
1331
+ "learning_rate": 1.2399193548387098e-05,
1332
+ "loss": 0.3887,
1333
+ "step": 1890
1334
+ },
1335
+ {
1336
+ "epoch": 0.76,
1337
+ "grad_norm": 9.73270034790039,
1338
+ "learning_rate": 1.2197580645161291e-05,
1339
+ "loss": 0.4129,
1340
+ "step": 1900
1341
+ },
1342
+ {
1343
+ "epoch": 0.764,
1344
+ "grad_norm": 7.710170269012451,
1345
+ "learning_rate": 1.1995967741935484e-05,
1346
+ "loss": 0.2848,
1347
+ "step": 1910
1348
+ },
1349
+ {
1350
+ "epoch": 0.768,
1351
+ "grad_norm": 2.321779489517212,
1352
+ "learning_rate": 1.1794354838709679e-05,
1353
+ "loss": 0.2391,
1354
+ "step": 1920
1355
+ },
1356
+ {
1357
+ "epoch": 0.772,
1358
+ "grad_norm": 7.158700466156006,
1359
+ "learning_rate": 1.159274193548387e-05,
1360
+ "loss": 0.3441,
1361
+ "step": 1930
1362
+ },
1363
+ {
1364
+ "epoch": 0.776,
1365
+ "grad_norm": 7.0267815589904785,
1366
+ "learning_rate": 1.1391129032258065e-05,
1367
+ "loss": 0.3391,
1368
+ "step": 1940
1369
+ },
1370
+ {
1371
+ "epoch": 0.78,
1372
+ "grad_norm": 8.56450080871582,
1373
+ "learning_rate": 1.1189516129032258e-05,
1374
+ "loss": 0.4174,
1375
+ "step": 1950
1376
+ },
1377
+ {
1378
+ "epoch": 0.784,
1379
+ "grad_norm": 13.748787879943848,
1380
+ "learning_rate": 1.0987903225806453e-05,
1381
+ "loss": 0.3004,
1382
+ "step": 1960
1383
+ },
1384
+ {
1385
+ "epoch": 0.788,
1386
+ "grad_norm": 12.583123207092285,
1387
+ "learning_rate": 1.0786290322580646e-05,
1388
+ "loss": 0.3205,
1389
+ "step": 1970
1390
+ },
1391
+ {
1392
+ "epoch": 0.792,
1393
+ "grad_norm": 25.797645568847656,
1394
+ "learning_rate": 1.0584677419354839e-05,
1395
+ "loss": 0.327,
1396
+ "step": 1980
1397
+ },
1398
+ {
1399
+ "epoch": 0.796,
1400
+ "grad_norm": 5.37972354888916,
1401
+ "learning_rate": 1.0383064516129034e-05,
1402
+ "loss": 0.3086,
1403
+ "step": 1990
1404
+ },
1405
+ {
1406
+ "epoch": 0.8,
1407
+ "grad_norm": 11.24709701538086,
1408
+ "learning_rate": 1.0181451612903227e-05,
1409
+ "loss": 0.2809,
1410
+ "step": 2000
1411
  }
1412
  ],
1413
  "logging_steps": 10,
 
1427
  "attributes": {}
1428
  }
1429
  },
1430
+ "total_flos": 3.61046831887872e+16,
1431
  "train_batch_size": 2,
1432
  "trial_name": null,
1433
  "trial_params": null