ljcamargo commited on
Commit
f7cd1b6
·
verified ·
1 Parent(s): e2e234f

Training in progress, step 2400, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a11ea4246169d4ba1b9cdfd1ec7ef840b6b697d514813efd0057d57657a9241f
3
  size 2558403928
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed21612abd96e9f3180e59a523332882fb8b7e33bf01c2e16df3203238ea60a4
3
  size 2558403928
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8bff2dd91c1612009877ee3b54bb35e744dcc95587bcbfa8944d30b0c5da6f76
3
  size 1313638993
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bf19ccf0e0d82bcdae677e010373be8ed4cd7f9ff952f45ada24714cfe7fae7
3
  size 1313638993
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f78d4bac68df9071c97615b2e87cf8d0471f01b17ef6d500cfcd39134f131824
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e44c6b7e3801cf2cadd741e677cb1212309bde8f6d206a1348cce5d738d15c3
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc99556bf27209385963813e3570510732839e6002d61d657266050e280a33eb
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7da7c5085795b13d2bf0030671cbddb9f62ae43221bf1424a3830d4cf8c19012
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d4ff6faba9116c06367ce60af1ab0ce82d05c5571557409e66e7e9ab509c55a
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ae14ab968b2e9bd5cc3fce1b0eaa2a5d11fdc6bd9129e17f4ebc5e3b9704808
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.84,
6
  "eval_steps": 500,
7
- "global_step": 2100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1485,6 +1485,216 @@
1485
  "learning_rate": 3.23278126679408e-06,
1486
  "loss": 2.9846,
1487
  "step": 2100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1488
  }
1489
  ],
1490
  "logging_steps": 10,
@@ -1504,7 +1714,7 @@
1504
  "attributes": {}
1505
  }
1506
  },
1507
- "total_flos": 9.87292665741312e+19,
1508
  "train_batch_size": 8,
1509
  "trial_name": null,
1510
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.96,
6
  "eval_steps": 500,
7
+ "global_step": 2400,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1485
  "learning_rate": 3.23278126679408e-06,
1486
  "loss": 2.9846,
1487
  "step": 2100
1488
+ },
1489
+ {
1490
+ "epoch": 0.844,
1491
+ "grad_norm": 9.084943771362305,
1492
+ "learning_rate": 3.0769074037237583e-06,
1493
+ "loss": 2.9903,
1494
+ "step": 2110
1495
+ },
1496
+ {
1497
+ "epoch": 0.848,
1498
+ "grad_norm": 6.5540595054626465,
1499
+ "learning_rate": 2.9246381934887684e-06,
1500
+ "loss": 3.2851,
1501
+ "step": 2120
1502
+ },
1503
+ {
1504
+ "epoch": 0.852,
1505
+ "grad_norm": 7.740701675415039,
1506
+ "learning_rate": 2.7759986725951703e-06,
1507
+ "loss": 2.9797,
1508
+ "step": 2130
1509
+ },
1510
+ {
1511
+ "epoch": 0.856,
1512
+ "grad_norm": 10.074856758117676,
1513
+ "learning_rate": 2.6310132807458894e-06,
1514
+ "loss": 3.1325,
1515
+ "step": 2140
1516
+ },
1517
+ {
1518
+ "epoch": 0.86,
1519
+ "grad_norm": 10.44127368927002,
1520
+ "learning_rate": 2.4897058568223137e-06,
1521
+ "loss": 3.0159,
1522
+ "step": 2150
1523
+ },
1524
+ {
1525
+ "epoch": 0.864,
1526
+ "grad_norm": 9.894632339477539,
1527
+ "learning_rate": 2.3520996349645995e-06,
1528
+ "loss": 2.8015,
1529
+ "step": 2160
1530
+ },
1531
+ {
1532
+ "epoch": 0.868,
1533
+ "grad_norm": 9.043245315551758,
1534
+ "learning_rate": 2.218217240751491e-06,
1535
+ "loss": 3.4477,
1536
+ "step": 2170
1537
+ },
1538
+ {
1539
+ "epoch": 0.872,
1540
+ "grad_norm": 9.901315689086914,
1541
+ "learning_rate": 2.088080687480151e-06,
1542
+ "loss": 3.3157,
1543
+ "step": 2180
1544
+ },
1545
+ {
1546
+ "epoch": 0.876,
1547
+ "grad_norm": 8.202696800231934,
1548
+ "learning_rate": 1.961711372546657e-06,
1549
+ "loss": 2.9467,
1550
+ "step": 2190
1551
+ },
1552
+ {
1553
+ "epoch": 0.88,
1554
+ "grad_norm": 8.691917419433594,
1555
+ "learning_rate": 1.8391300739278139e-06,
1556
+ "loss": 2.9079,
1557
+ "step": 2200
1558
+ },
1559
+ {
1560
+ "epoch": 0.884,
1561
+ "grad_norm": 13.363630294799805,
1562
+ "learning_rate": 1.7203569467647674e-06,
1563
+ "loss": 3.2583,
1564
+ "step": 2210
1565
+ },
1566
+ {
1567
+ "epoch": 0.888,
1568
+ "grad_norm": 11.732659339904785,
1569
+ "learning_rate": 1.6054115200490493e-06,
1570
+ "loss": 3.0431,
1571
+ "step": 2220
1572
+ },
1573
+ {
1574
+ "epoch": 0.892,
1575
+ "grad_norm": 7.8193230628967285,
1576
+ "learning_rate": 1.4943126934115536e-06,
1577
+ "loss": 3.1155,
1578
+ "step": 2230
1579
+ },
1580
+ {
1581
+ "epoch": 0.896,
1582
+ "grad_norm": 6.232199192047119,
1583
+ "learning_rate": 1.3870787340150376e-06,
1584
+ "loss": 3.2006,
1585
+ "step": 2240
1586
+ },
1587
+ {
1588
+ "epoch": 0.9,
1589
+ "grad_norm": 5.650846004486084,
1590
+ "learning_rate": 1.2837272735505668e-06,
1591
+ "loss": 2.8882,
1592
+ "step": 2250
1593
+ },
1594
+ {
1595
+ "epoch": 0.904,
1596
+ "grad_norm": 7.191598892211914,
1597
+ "learning_rate": 1.1842753053384559e-06,
1598
+ "loss": 3.0833,
1599
+ "step": 2260
1600
+ },
1601
+ {
1602
+ "epoch": 0.908,
1603
+ "grad_norm": 8.854833602905273,
1604
+ "learning_rate": 1.0887391815342124e-06,
1605
+ "loss": 3.3196,
1606
+ "step": 2270
1607
+ },
1608
+ {
1609
+ "epoch": 0.912,
1610
+ "grad_norm": 13.160386085510254,
1611
+ "learning_rate": 9.971346104398455e-07,
1612
+ "loss": 3.564,
1613
+ "step": 2280
1614
+ },
1615
+ {
1616
+ "epoch": 0.916,
1617
+ "grad_norm": 8.540671348571777,
1618
+ "learning_rate": 9.09476653921082e-07,
1619
+ "loss": 3.1383,
1620
+ "step": 2290
1621
+ },
1622
+ {
1623
+ "epoch": 0.92,
1624
+ "grad_norm": 12.331473350524902,
1625
+ "learning_rate": 8.257797249308419e-07,
1626
+ "loss": 3.259,
1627
+ "step": 2300
1628
+ },
1629
+ {
1630
+ "epoch": 0.924,
1631
+ "grad_norm": 7.576813697814941,
1632
+ "learning_rate": 7.460575851394341e-07,
1633
+ "loss": 2.8659,
1634
+ "step": 2310
1635
+ },
1636
+ {
1637
+ "epoch": 0.928,
1638
+ "grad_norm": 6.937955379486084,
1639
+ "learning_rate": 6.703233426718136e-07,
1640
+ "loss": 2.9416,
1641
+ "step": 2320
1642
+ },
1643
+ {
1644
+ "epoch": 0.932,
1645
+ "grad_norm": 7.9867777824401855,
1646
+ "learning_rate": 5.985894499523193e-07,
1647
+ "loss": 3.0008,
1648
+ "step": 2330
1649
+ },
1650
+ {
1651
+ "epoch": 0.936,
1652
+ "grad_norm": 10.474209785461426,
1653
+ "learning_rate": 5.308677016572145e-07,
1654
+ "loss": 3.6042,
1655
+ "step": 2340
1656
+ },
1657
+ {
1658
+ "epoch": 0.94,
1659
+ "grad_norm": 6.954331398010254,
1660
+ "learning_rate": 4.6716923277536627e-07,
1661
+ "loss": 2.696,
1662
+ "step": 2350
1663
+ },
1664
+ {
1665
+ "epoch": 0.944,
1666
+ "grad_norm": 6.555063247680664,
1667
+ "learning_rate": 4.075045167774072e-07,
1668
+ "loss": 3.2311,
1669
+ "step": 2360
1670
+ },
1671
+ {
1672
+ "epoch": 0.948,
1673
+ "grad_norm": 7.122920513153076,
1674
+ "learning_rate": 3.518833638936514e-07,
1675
+ "loss": 3.1349,
1676
+ "step": 2370
1677
+ },
1678
+ {
1679
+ "epoch": 0.952,
1680
+ "grad_norm": 10.269899368286133,
1681
+ "learning_rate": 3.003149195010907e-07,
1682
+ "loss": 2.9381,
1683
+ "step": 2380
1684
+ },
1685
+ {
1686
+ "epoch": 0.956,
1687
+ "grad_norm": 8.958882331848145,
1688
+ "learning_rate": 2.528076626196585e-07,
1689
+ "loss": 3.0804,
1690
+ "step": 2390
1691
+ },
1692
+ {
1693
+ "epoch": 0.96,
1694
+ "grad_norm": 11.036646842956543,
1695
+ "learning_rate": 2.0936940451811437e-07,
1696
+ "loss": 3.0191,
1697
+ "step": 2400
1698
  }
1699
  ],
1700
  "logging_steps": 10,
 
1714
  "attributes": {}
1715
  }
1716
  },
1717
+ "total_flos": 1.128334475132928e+20,
1718
  "train_batch_size": 8,
1719
  "trial_name": null,
1720
  "trial_params": null