shulijia commited on
Commit
235cf9a
·
verified ·
1 Parent(s): f4225fe

Training in progress, step 1930, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ecb1c43c9a6caae32eedb74afa839b6db8df1604949ac103d65018afa63a05ce
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7ee4397f7ac55478163e72ab579b35e42a987ee8ac4921495614e46cb4fa3fd
3
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d22ddd6eb04ea57bbea8812aaf6fcc69925e5324bb385f88fbe59698add4850d
3
  size 4768662910
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f16b1b30bde836034c65491c3dbf61376ed4653b1ed3d9965a4aacdd5e4b53be
3
  size 4768662910
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:52cca5856c568bc52c683b690919168fa27bfbdfefc6e0a62355afa6011157c3
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9afbf853352cfbcfea61884ff6a2ddcd2aee1ce8618589cf5b56912c1b160011
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7cc13ac7175e5452c0a51656649ea1f0782593abf479b3fbadc42f21b535fdb6
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed086eb7192e41567a97c6bf18d0c6b0652f463d79fa509a55c93668bd7f3655
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.7771014117342313,
6
  "eval_steps": 100,
7
- "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1478,6 +1478,425 @@
1478
  "eval_samples_per_second": 9.672,
1479
  "eval_steps_per_second": 1.212,
1480
  "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1481
  }
1482
  ],
1483
  "logging_steps": 10,
@@ -1492,12 +1911,12 @@
1492
  "should_evaluate": false,
1493
  "should_log": false,
1494
  "should_save": true,
1495
- "should_training_stop": false
1496
  },
1497
  "attributes": {}
1498
  }
1499
  },
1500
- "total_flos": 1.6237392297984e+16,
1501
  "train_batch_size": 1,
1502
  "trial_name": null,
1503
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9998704830980443,
6
  "eval_steps": 100,
7
+ "global_step": 1930,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1478
  "eval_samples_per_second": 9.672,
1479
  "eval_steps_per_second": 1.212,
1480
  "step": 1500
1481
+ },
1482
+ {
1483
+ "epoch": 0.7822820878124596,
1484
+ "grad_norm": 0.8270747065544128,
1485
+ "learning_rate": 1.2118595279217041e-05,
1486
+ "loss": 0.1642,
1487
+ "mean_token_accuracy": 0.969116922467947,
1488
+ "num_tokens": 6184960.0,
1489
+ "step": 1510
1490
+ },
1491
+ {
1492
+ "epoch": 0.7874627638906877,
1493
+ "grad_norm": 0.7555075883865356,
1494
+ "learning_rate": 1.1830742659758205e-05,
1495
+ "loss": 0.1521,
1496
+ "mean_token_accuracy": 0.9724070452153682,
1497
+ "num_tokens": 6225920.0,
1498
+ "step": 1520
1499
+ },
1500
+ {
1501
+ "epoch": 0.792643439968916,
1502
+ "grad_norm": 0.8719633221626282,
1503
+ "learning_rate": 1.1542890040299368e-05,
1504
+ "loss": 0.1639,
1505
+ "mean_token_accuracy": 0.9688356138765812,
1506
+ "num_tokens": 6266880.0,
1507
+ "step": 1530
1508
+ },
1509
+ {
1510
+ "epoch": 0.7978241160471442,
1511
+ "grad_norm": 0.792812705039978,
1512
+ "learning_rate": 1.125503742084053e-05,
1513
+ "loss": 0.1641,
1514
+ "mean_token_accuracy": 0.9699363961815834,
1515
+ "num_tokens": 6307840.0,
1516
+ "step": 1540
1517
+ },
1518
+ {
1519
+ "epoch": 0.8030047921253723,
1520
+ "grad_norm": 1.0832456350326538,
1521
+ "learning_rate": 1.0967184801381693e-05,
1522
+ "loss": 0.1441,
1523
+ "mean_token_accuracy": 0.9728962793946266,
1524
+ "num_tokens": 6348800.0,
1525
+ "step": 1550
1526
+ },
1527
+ {
1528
+ "epoch": 0.8081854682036006,
1529
+ "grad_norm": 0.8835451006889343,
1530
+ "learning_rate": 1.0679332181922857e-05,
1531
+ "loss": 0.1652,
1532
+ "mean_token_accuracy": 0.9685420744121075,
1533
+ "num_tokens": 6389760.0,
1534
+ "step": 1560
1535
+ },
1536
+ {
1537
+ "epoch": 0.8133661442818287,
1538
+ "grad_norm": 0.695384681224823,
1539
+ "learning_rate": 1.0391479562464019e-05,
1540
+ "loss": 0.1454,
1541
+ "mean_token_accuracy": 0.973140899837017,
1542
+ "num_tokens": 6430720.0,
1543
+ "step": 1570
1544
+ },
1545
+ {
1546
+ "epoch": 0.818546820360057,
1547
+ "grad_norm": 0.926196277141571,
1548
+ "learning_rate": 1.0103626943005182e-05,
1549
+ "loss": 0.1448,
1550
+ "mean_token_accuracy": 0.9730430491268635,
1551
+ "num_tokens": 6471680.0,
1552
+ "step": 1580
1553
+ },
1554
+ {
1555
+ "epoch": 0.8237274964382852,
1556
+ "grad_norm": 0.8786157369613647,
1557
+ "learning_rate": 9.815774323546346e-06,
1558
+ "loss": 0.1444,
1559
+ "mean_token_accuracy": 0.9727250434458256,
1560
+ "num_tokens": 6512640.0,
1561
+ "step": 1590
1562
+ },
1563
+ {
1564
+ "epoch": 0.8289081725165134,
1565
+ "grad_norm": 0.8193939328193665,
1566
+ "learning_rate": 9.527921704087508e-06,
1567
+ "loss": 0.1434,
1568
+ "step": 1600
1569
+ },
1570
+ {
1571
+ "epoch": 0.8289081725165134,
1572
+ "eval_loss": 0.15118131041526794,
1573
+ "eval_mean_token_accuracy": 0.9719678637593292,
1574
+ "eval_num_tokens": 6553600.0,
1575
+ "eval_runtime": 177.6288,
1576
+ "eval_samples_per_second": 9.661,
1577
+ "eval_steps_per_second": 1.21,
1578
+ "step": 1600
1579
+ },
1580
+ {
1581
+ "epoch": 0.8340888485947416,
1582
+ "grad_norm": 0.7355690002441406,
1583
+ "learning_rate": 9.240069084628671e-06,
1584
+ "loss": 0.1486,
1585
+ "mean_token_accuracy": 0.9729452040046453,
1586
+ "num_tokens": 6594560.0,
1587
+ "step": 1610
1588
+ },
1589
+ {
1590
+ "epoch": 0.8392695246729698,
1591
+ "grad_norm": 0.8735950589179993,
1592
+ "learning_rate": 8.952216465169835e-06,
1593
+ "loss": 0.1445,
1594
+ "mean_token_accuracy": 0.9733365938067436,
1595
+ "num_tokens": 6635520.0,
1596
+ "step": 1620
1597
+ },
1598
+ {
1599
+ "epoch": 0.844450200751198,
1600
+ "grad_norm": 1.1389552354812622,
1601
+ "learning_rate": 8.664363845710997e-06,
1602
+ "loss": 0.1558,
1603
+ "mean_token_accuracy": 0.9712817937135696,
1604
+ "num_tokens": 6676480.0,
1605
+ "step": 1630
1606
+ },
1607
+ {
1608
+ "epoch": 0.8496308768294263,
1609
+ "grad_norm": 0.8766786456108093,
1610
+ "learning_rate": 8.37651122625216e-06,
1611
+ "loss": 0.1558,
1612
+ "mean_token_accuracy": 0.9699853204190731,
1613
+ "num_tokens": 6717440.0,
1614
+ "step": 1640
1615
+ },
1616
+ {
1617
+ "epoch": 0.8548115529076544,
1618
+ "grad_norm": 0.8025283217430115,
1619
+ "learning_rate": 8.088658606793324e-06,
1620
+ "loss": 0.1561,
1621
+ "mean_token_accuracy": 0.9704500935971737,
1622
+ "num_tokens": 6758400.0,
1623
+ "step": 1650
1624
+ },
1625
+ {
1626
+ "epoch": 0.8599922289858827,
1627
+ "grad_norm": 0.7806901931762695,
1628
+ "learning_rate": 7.800805987334485e-06,
1629
+ "loss": 0.1635,
1630
+ "mean_token_accuracy": 0.969275925308466,
1631
+ "num_tokens": 6799360.0,
1632
+ "step": 1660
1633
+ },
1634
+ {
1635
+ "epoch": 0.8651729050641108,
1636
+ "grad_norm": 0.6943385601043701,
1637
+ "learning_rate": 7.512953367875648e-06,
1638
+ "loss": 0.1351,
1639
+ "mean_token_accuracy": 0.9746330663561821,
1640
+ "num_tokens": 6840320.0,
1641
+ "step": 1670
1642
+ },
1643
+ {
1644
+ "epoch": 0.8703535811423391,
1645
+ "grad_norm": 0.7692267894744873,
1646
+ "learning_rate": 7.225100748416811e-06,
1647
+ "loss": 0.15,
1648
+ "mean_token_accuracy": 0.9721134983003139,
1649
+ "num_tokens": 6881280.0,
1650
+ "step": 1680
1651
+ },
1652
+ {
1653
+ "epoch": 0.8755342572205673,
1654
+ "grad_norm": 1.0311888456344604,
1655
+ "learning_rate": 6.9372481289579734e-06,
1656
+ "loss": 0.1476,
1657
+ "mean_token_accuracy": 0.9735812105238437,
1658
+ "num_tokens": 6922240.0,
1659
+ "step": 1690
1660
+ },
1661
+ {
1662
+ "epoch": 0.8807149332987955,
1663
+ "grad_norm": 0.7127304673194885,
1664
+ "learning_rate": 6.649395509499137e-06,
1665
+ "loss": 0.1639,
1666
+ "step": 1700
1667
+ },
1668
+ {
1669
+ "epoch": 0.8807149332987955,
1670
+ "eval_loss": 0.15019147098064423,
1671
+ "eval_mean_token_accuracy": 0.972071400076844,
1672
+ "eval_num_tokens": 6963200.0,
1673
+ "eval_runtime": 177.5182,
1674
+ "eval_samples_per_second": 9.667,
1675
+ "eval_steps_per_second": 1.211,
1676
+ "step": 1700
1677
+ },
1678
+ {
1679
+ "epoch": 0.8858956093770237,
1680
+ "grad_norm": 0.8600668907165527,
1681
+ "learning_rate": 6.3615428900403e-06,
1682
+ "loss": 0.1562,
1683
+ "mean_token_accuracy": 0.9699975498020649,
1684
+ "num_tokens": 7004160.0,
1685
+ "step": 1710
1686
+ },
1687
+ {
1688
+ "epoch": 0.8910762854552519,
1689
+ "grad_norm": 0.8082440495491028,
1690
+ "learning_rate": 6.073690270581462e-06,
1691
+ "loss": 0.1587,
1692
+ "mean_token_accuracy": 0.9702299386262894,
1693
+ "num_tokens": 7045120.0,
1694
+ "step": 1720
1695
+ },
1696
+ {
1697
+ "epoch": 0.8962569615334801,
1698
+ "grad_norm": 0.8098168969154358,
1699
+ "learning_rate": 5.785837651122626e-06,
1700
+ "loss": 0.1421,
1701
+ "mean_token_accuracy": 0.9735078245401383,
1702
+ "num_tokens": 7086080.0,
1703
+ "step": 1730
1704
+ },
1705
+ {
1706
+ "epoch": 0.9014376376117084,
1707
+ "grad_norm": 0.6847867965698242,
1708
+ "learning_rate": 5.4979850316637885e-06,
1709
+ "loss": 0.1435,
1710
+ "mean_token_accuracy": 0.9728962808847428,
1711
+ "num_tokens": 7127040.0,
1712
+ "step": 1740
1713
+ },
1714
+ {
1715
+ "epoch": 0.9066183136899365,
1716
+ "grad_norm": 1.0864291191101074,
1717
+ "learning_rate": 5.210132412204952e-06,
1718
+ "loss": 0.1471,
1719
+ "mean_token_accuracy": 0.9716976463794709,
1720
+ "num_tokens": 7168000.0,
1721
+ "step": 1750
1722
+ },
1723
+ {
1724
+ "epoch": 0.9117989897681648,
1725
+ "grad_norm": 0.7632136344909668,
1726
+ "learning_rate": 4.922279792746115e-06,
1727
+ "loss": 0.1421,
1728
+ "mean_token_accuracy": 0.9734344378113746,
1729
+ "num_tokens": 7208960.0,
1730
+ "step": 1760
1731
+ },
1732
+ {
1733
+ "epoch": 0.9169796658463929,
1734
+ "grad_norm": 0.7627587914466858,
1735
+ "learning_rate": 4.634427173287277e-06,
1736
+ "loss": 0.155,
1737
+ "mean_token_accuracy": 0.970841483771801,
1738
+ "num_tokens": 7249920.0,
1739
+ "step": 1770
1740
+ },
1741
+ {
1742
+ "epoch": 0.9221603419246212,
1743
+ "grad_norm": 0.8158827424049377,
1744
+ "learning_rate": 4.34657455382844e-06,
1745
+ "loss": 0.1516,
1746
+ "mean_token_accuracy": 0.9719178065657615,
1747
+ "num_tokens": 7290880.0,
1748
+ "step": 1780
1749
+ },
1750
+ {
1751
+ "epoch": 0.9273410180028494,
1752
+ "grad_norm": 0.7051241397857666,
1753
+ "learning_rate": 4.058721934369604e-06,
1754
+ "loss": 0.155,
1755
+ "mean_token_accuracy": 0.9710616424679757,
1756
+ "num_tokens": 7331840.0,
1757
+ "step": 1790
1758
+ },
1759
+ {
1760
+ "epoch": 0.9325216940810775,
1761
+ "grad_norm": 0.9571183919906616,
1762
+ "learning_rate": 3.770869314910766e-06,
1763
+ "loss": 0.1523,
1764
+ "step": 1800
1765
+ },
1766
+ {
1767
+ "epoch": 0.9325216940810775,
1768
+ "eval_loss": 0.1495211273431778,
1769
+ "eval_mean_token_accuracy": 0.9722500282664631,
1770
+ "eval_num_tokens": 7372800.0,
1771
+ "eval_runtime": 177.4195,
1772
+ "eval_samples_per_second": 9.672,
1773
+ "eval_steps_per_second": 1.212,
1774
+ "step": 1800
1775
+ },
1776
+ {
1777
+ "epoch": 0.9377023701593058,
1778
+ "grad_norm": 0.7204054594039917,
1779
+ "learning_rate": 3.4830166954519285e-06,
1780
+ "loss": 0.147,
1781
+ "mean_token_accuracy": 0.9728962782770395,
1782
+ "num_tokens": 7413760.0,
1783
+ "step": 1810
1784
+ },
1785
+ {
1786
+ "epoch": 0.942883046237534,
1787
+ "grad_norm": 0.7952613830566406,
1788
+ "learning_rate": 3.1951640759930916e-06,
1789
+ "loss": 0.1379,
1790
+ "mean_token_accuracy": 0.9747309163212776,
1791
+ "num_tokens": 7454720.0,
1792
+ "step": 1820
1793
+ },
1794
+ {
1795
+ "epoch": 0.9480637223157622,
1796
+ "grad_norm": 0.6960965991020203,
1797
+ "learning_rate": 2.9073114565342547e-06,
1798
+ "loss": 0.1439,
1799
+ "mean_token_accuracy": 0.9738992169499397,
1800
+ "num_tokens": 7495680.0,
1801
+ "step": 1830
1802
+ },
1803
+ {
1804
+ "epoch": 0.9532443983939904,
1805
+ "grad_norm": 0.6948501467704773,
1806
+ "learning_rate": 2.619458837075418e-06,
1807
+ "loss": 0.1472,
1808
+ "mean_token_accuracy": 0.9721868857741356,
1809
+ "num_tokens": 7536640.0,
1810
+ "step": 1840
1811
+ },
1812
+ {
1813
+ "epoch": 0.9584250744722186,
1814
+ "grad_norm": 1.1260844469070435,
1815
+ "learning_rate": 2.3316062176165805e-06,
1816
+ "loss": 0.1595,
1817
+ "mean_token_accuracy": 0.9702788606286049,
1818
+ "num_tokens": 7577600.0,
1819
+ "step": 1850
1820
+ },
1821
+ {
1822
+ "epoch": 0.9636057505504468,
1823
+ "grad_norm": 0.8835856914520264,
1824
+ "learning_rate": 2.0437535981577436e-06,
1825
+ "loss": 0.1555,
1826
+ "mean_token_accuracy": 0.9711839489638805,
1827
+ "num_tokens": 7618560.0,
1828
+ "step": 1860
1829
+ },
1830
+ {
1831
+ "epoch": 0.9687864266286751,
1832
+ "grad_norm": 0.7467979788780212,
1833
+ "learning_rate": 1.755900978698906e-06,
1834
+ "loss": 0.1706,
1835
+ "mean_token_accuracy": 0.9686643823981285,
1836
+ "num_tokens": 7659520.0,
1837
+ "step": 1870
1838
+ },
1839
+ {
1840
+ "epoch": 0.9739671027069032,
1841
+ "grad_norm": 1.7198349237442017,
1842
+ "learning_rate": 1.4680483592400692e-06,
1843
+ "loss": 0.1638,
1844
+ "mean_token_accuracy": 0.9698630094528198,
1845
+ "num_tokens": 7700480.0,
1846
+ "step": 1880
1847
+ },
1848
+ {
1849
+ "epoch": 0.9791477787851315,
1850
+ "grad_norm": 0.6408202052116394,
1851
+ "learning_rate": 1.180195739781232e-06,
1852
+ "loss": 0.1508,
1853
+ "mean_token_accuracy": 0.9718444183468818,
1854
+ "num_tokens": 7741440.0,
1855
+ "step": 1890
1856
+ },
1857
+ {
1858
+ "epoch": 0.9843284548633596,
1859
+ "grad_norm": 0.810226321220398,
1860
+ "learning_rate": 8.92343120322395e-07,
1861
+ "loss": 0.1443,
1862
+ "step": 1900
1863
+ },
1864
+ {
1865
+ "epoch": 0.9843284548633596,
1866
+ "eval_loss": 0.14907296001911163,
1867
+ "eval_mean_token_accuracy": 0.9723376360050467,
1868
+ "eval_num_tokens": 7782400.0,
1869
+ "eval_runtime": 177.4193,
1870
+ "eval_samples_per_second": 9.672,
1871
+ "eval_steps_per_second": 1.212,
1872
+ "step": 1900
1873
+ },
1874
+ {
1875
+ "epoch": 0.9895091309415879,
1876
+ "grad_norm": 0.8254349231719971,
1877
+ "learning_rate": 6.04490500863558e-07,
1878
+ "loss": 0.145,
1879
+ "mean_token_accuracy": 0.9729085095226765,
1880
+ "num_tokens": 7823360.0,
1881
+ "step": 1910
1882
+ },
1883
+ {
1884
+ "epoch": 0.9946898070198161,
1885
+ "grad_norm": 0.7384099364280701,
1886
+ "learning_rate": 3.166378814047208e-07,
1887
+ "loss": 0.1438,
1888
+ "mean_token_accuracy": 0.9729941256344319,
1889
+ "num_tokens": 7864320.0,
1890
+ "step": 1920
1891
+ },
1892
+ {
1893
+ "epoch": 0.9998704830980443,
1894
+ "grad_norm": 0.8660192489624023,
1895
+ "learning_rate": 2.878526194588371e-08,
1896
+ "loss": 0.1535,
1897
+ "mean_token_accuracy": 0.971844420582056,
1898
+ "num_tokens": 7905280.0,
1899
+ "step": 1930
1900
  }
1901
  ],
1902
  "logging_steps": 10,
 
1911
  "should_evaluate": false,
1912
  "should_log": false,
1913
  "should_save": true,
1914
+ "should_training_stop": true
1915
  },
1916
  "attributes": {}
1917
  }
1918
  },
1919
+ "total_flos": 2.089211142340608e+16,
1920
  "train_batch_size": 1,
1921
  "trial_name": null,
1922
  "trial_params": null