3N3G commited on
Commit
4395a23
·
verified ·
1 Parent(s): e087874

Training in progress, step 208, checkpoint

Browse files
last-checkpoint/model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2a90a3fb8be7c015f5d69d981f401deb0a1c7090ca756c441db00cf9f12ce9f
3
  size 4969539560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50a7626c3332382c720b25d7028428e6e693206a85b1d278123f350e6447c549
3
  size 4969539560
last-checkpoint/model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a58a8899b511d48c4f12a03907873a2e2af75be92f7e47ba879a77cc9687488
3
  size 1912795688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:674ac2c674747082527a37e7013363c3374ff004d5b78edf91c3585792370cd4
3
  size 1912795688
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 44.0,
6
  "eval_steps": 16,
7
- "global_step": 176,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1328,6 +1328,246 @@
1328
  "eval_samples_per_second": 16.828,
1329
  "eval_steps_per_second": 16.828,
1330
  "step": 176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1331
  }
1332
  ],
1333
  "logging_steps": 1,
@@ -1347,7 +1587,7 @@
1347
  "attributes": {}
1348
  }
1349
  },
1350
- "total_flos": 5.902744584388608e+16,
1351
  "train_batch_size": 1,
1352
  "trial_name": null,
1353
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 52.0,
6
  "eval_steps": 16,
7
+ "global_step": 208,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1328
  "eval_samples_per_second": 16.828,
1329
  "eval_steps_per_second": 16.828,
1330
  "step": 176
1331
+ },
1332
+ {
1333
+ "epoch": 44.29090909090909,
1334
+ "grad_norm": 6.314858436584473,
1335
+ "learning_rate": 7.185729670371604e-08,
1336
+ "loss": 0.7001,
1337
+ "step": 177
1338
+ },
1339
+ {
1340
+ "epoch": 44.58181818181818,
1341
+ "grad_norm": 6.364148139953613,
1342
+ "learning_rate": 7.149255520259337e-08,
1343
+ "loss": 0.786,
1344
+ "step": 178
1345
+ },
1346
+ {
1347
+ "epoch": 44.872727272727275,
1348
+ "grad_norm": 5.679451942443848,
1349
+ "learning_rate": 7.11265577295385e-08,
1350
+ "loss": 0.6767,
1351
+ "step": 179
1352
+ },
1353
+ {
1354
+ "epoch": 45.0,
1355
+ "grad_norm": 6.4454216957092285,
1356
+ "learning_rate": 7.075933215667603e-08,
1357
+ "loss": 0.7351,
1358
+ "step": 180
1359
+ },
1360
+ {
1361
+ "epoch": 45.29090909090909,
1362
+ "grad_norm": 5.991427421569824,
1363
+ "learning_rate": 7.039090644965509e-08,
1364
+ "loss": 0.7047,
1365
+ "step": 181
1366
+ },
1367
+ {
1368
+ "epoch": 45.58181818181818,
1369
+ "grad_norm": 5.386115550994873,
1370
+ "learning_rate": 7.002130866551968e-08,
1371
+ "loss": 0.7113,
1372
+ "step": 182
1373
+ },
1374
+ {
1375
+ "epoch": 45.872727272727275,
1376
+ "grad_norm": 6.815364360809326,
1377
+ "learning_rate": 6.965056695057204e-08,
1378
+ "loss": 0.7255,
1379
+ "step": 183
1380
+ },
1381
+ {
1382
+ "epoch": 46.0,
1383
+ "grad_norm": 6.38714599609375,
1384
+ "learning_rate": 6.927870953822915e-08,
1385
+ "loss": 0.7503,
1386
+ "step": 184
1387
+ },
1388
+ {
1389
+ "epoch": 46.29090909090909,
1390
+ "grad_norm": 5.759856224060059,
1391
+ "learning_rate": 6.890576474687262e-08,
1392
+ "loss": 0.7008,
1393
+ "step": 185
1394
+ },
1395
+ {
1396
+ "epoch": 46.58181818181818,
1397
+ "grad_norm": 5.1396918296813965,
1398
+ "learning_rate": 6.853176097769228e-08,
1399
+ "loss": 0.6925,
1400
+ "step": 186
1401
+ },
1402
+ {
1403
+ "epoch": 46.872727272727275,
1404
+ "grad_norm": 5.9070539474487305,
1405
+ "learning_rate": 6.815672671252315e-08,
1406
+ "loss": 0.7409,
1407
+ "step": 187
1408
+ },
1409
+ {
1410
+ "epoch": 47.0,
1411
+ "grad_norm": 5.90541410446167,
1412
+ "learning_rate": 6.778069051167653e-08,
1413
+ "loss": 0.702,
1414
+ "step": 188
1415
+ },
1416
+ {
1417
+ "epoch": 47.29090909090909,
1418
+ "grad_norm": 5.474076747894287,
1419
+ "learning_rate": 6.740368101176495e-08,
1420
+ "loss": 0.7085,
1421
+ "step": 189
1422
+ },
1423
+ {
1424
+ "epoch": 47.58181818181818,
1425
+ "grad_norm": 5.111520767211914,
1426
+ "learning_rate": 6.702572692352155e-08,
1427
+ "loss": 0.685,
1428
+ "step": 190
1429
+ },
1430
+ {
1431
+ "epoch": 47.872727272727275,
1432
+ "grad_norm": 5.618140697479248,
1433
+ "learning_rate": 6.664685702961344e-08,
1434
+ "loss": 0.7551,
1435
+ "step": 191
1436
+ },
1437
+ {
1438
+ "epoch": 48.0,
1439
+ "grad_norm": 4.961245059967041,
1440
+ "learning_rate": 6.626710018244986e-08,
1441
+ "loss": 0.6327,
1442
+ "step": 192
1443
+ },
1444
+ {
1445
+ "epoch": 48.0,
1446
+ "eval_loss": 0.6752312183380127,
1447
+ "eval_runtime": 0.7832,
1448
+ "eval_samples_per_second": 16.599,
1449
+ "eval_steps_per_second": 16.599,
1450
+ "step": 192
1451
+ },
1452
+ {
1453
+ "epoch": 48.29090909090909,
1454
+ "grad_norm": 5.36975622177124,
1455
+ "learning_rate": 6.588648530198504e-08,
1456
+ "loss": 0.7312,
1457
+ "step": 193
1458
+ },
1459
+ {
1460
+ "epoch": 48.58181818181818,
1461
+ "grad_norm": 5.021007061004639,
1462
+ "learning_rate": 6.550504137351574e-08,
1463
+ "loss": 0.7467,
1464
+ "step": 194
1465
+ },
1466
+ {
1467
+ "epoch": 48.872727272727275,
1468
+ "grad_norm": 4.721583843231201,
1469
+ "learning_rate": 6.512279744547392e-08,
1470
+ "loss": 0.6271,
1471
+ "step": 195
1472
+ },
1473
+ {
1474
+ "epoch": 49.0,
1475
+ "grad_norm": 5.531439304351807,
1476
+ "learning_rate": 6.473978262721462e-08,
1477
+ "loss": 0.7127,
1478
+ "step": 196
1479
+ },
1480
+ {
1481
+ "epoch": 49.29090909090909,
1482
+ "grad_norm": 5.3525309562683105,
1483
+ "learning_rate": 6.435602608679917e-08,
1484
+ "loss": 0.7255,
1485
+ "step": 197
1486
+ },
1487
+ {
1488
+ "epoch": 49.58181818181818,
1489
+ "grad_norm": 4.411137104034424,
1490
+ "learning_rate": 6.397155704877387e-08,
1491
+ "loss": 0.6177,
1492
+ "step": 198
1493
+ },
1494
+ {
1495
+ "epoch": 49.872727272727275,
1496
+ "grad_norm": 4.907252788543701,
1497
+ "learning_rate": 6.358640479194451e-08,
1498
+ "loss": 0.7295,
1499
+ "step": 199
1500
+ },
1501
+ {
1502
+ "epoch": 50.0,
1503
+ "grad_norm": 4.626101493835449,
1504
+ "learning_rate": 6.320059864714664e-08,
1505
+ "loss": 0.7091,
1506
+ "step": 200
1507
+ },
1508
+ {
1509
+ "epoch": 50.29090909090909,
1510
+ "grad_norm": 4.853626728057861,
1511
+ "learning_rate": 6.281416799501187e-08,
1512
+ "loss": 0.7432,
1513
+ "step": 201
1514
+ },
1515
+ {
1516
+ "epoch": 50.58181818181818,
1517
+ "grad_norm": 4.439899921417236,
1518
+ "learning_rate": 6.242714226373049e-08,
1519
+ "loss": 0.676,
1520
+ "step": 202
1521
+ },
1522
+ {
1523
+ "epoch": 50.872727272727275,
1524
+ "grad_norm": 4.5280985832214355,
1525
+ "learning_rate": 6.203955092681039e-08,
1526
+ "loss": 0.7086,
1527
+ "step": 203
1528
+ },
1529
+ {
1530
+ "epoch": 51.0,
1531
+ "grad_norm": 4.414018154144287,
1532
+ "learning_rate": 6.165142350083249e-08,
1533
+ "loss": 0.5264,
1534
+ "step": 204
1535
+ },
1536
+ {
1537
+ "epoch": 51.29090909090909,
1538
+ "grad_norm": 4.17572021484375,
1539
+ "learning_rate": 6.126278954320294e-08,
1540
+ "loss": 0.7346,
1541
+ "step": 205
1542
+ },
1543
+ {
1544
+ "epoch": 51.58181818181818,
1545
+ "grad_norm": 4.015255928039551,
1546
+ "learning_rate": 6.087367864990232e-08,
1547
+ "loss": 0.6239,
1548
+ "step": 206
1549
+ },
1550
+ {
1551
+ "epoch": 51.872727272727275,
1552
+ "grad_norm": 4.698182582855225,
1553
+ "learning_rate": 6.048412045323163e-08,
1554
+ "loss": 0.688,
1555
+ "step": 207
1556
+ },
1557
+ {
1558
+ "epoch": 52.0,
1559
+ "grad_norm": 5.5075297355651855,
1560
+ "learning_rate": 6.00941446195558e-08,
1561
+ "loss": 0.6903,
1562
+ "step": 208
1563
+ },
1564
+ {
1565
+ "epoch": 52.0,
1566
+ "eval_loss": 0.6604220271110535,
1567
+ "eval_runtime": 0.6915,
1568
+ "eval_samples_per_second": 18.8,
1569
+ "eval_steps_per_second": 18.8,
1570
+ "step": 208
1571
  }
1572
  ],
1573
  "logging_steps": 1,
 
1587
  "attributes": {}
1588
  }
1589
  },
1590
+ "total_flos": 6.975970872459264e+16,
1591
  "train_batch_size": 1,
1592
  "trial_name": null,
1593
  "trial_params": null