rovdetection commited on
Commit
fd9d772
·
verified ·
1 Parent(s): d1a8c69

Training in progress, step 2000, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac5c8a64c0ab83c2d761c379a2246a2a1b527485d9e6ca3902af906505086931
3
  size 9446744
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdf2871a23de26395412fbb80cd5cfc6261483030011b659b66248a001490ba5
3
  size 9446744
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7ff189dbe667133c77befb63ee7a1edfa6003ef9d12584ea250122201496d73
3
  size 4879947
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13d0d3ac532ad9924ef2b3bb9206e041a19d9bb2aae0a0f9b0e9fb94268b3e2f
3
  size 4879947
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de0306e3417ea954b631b3708345453027eb0606d563ed02ac6f44b43062d77b
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96666620a506272b19319944e27b166707266143df40b9e008c7e67e99eb3d33
3
  size 14917
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6658911e5c000a5756986f0311bc01884aab6a488b04caad451226050b401b09
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad3ae1599c24410db8dc749055bc50d225b3704ca4ce296c6043ed130093cd3d
3
  size 14917
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca372268f4fa9335030c0cb7aedb6cdba75f457da50e7a4034abb1a2d0843689
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4aa03f6e0cd07cf67ce1fbe3101d545f5771ef9148b9debf02b11cf6948da5c
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ae3874a248c143370ff82b050c4843d50a75c26dfebab3f0c39c4f64277b398a
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa56fa8fa334bce407f019356c2a989207ab5f10b19e9753e7cbc5ea11bcd4ec
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.577691811734365,
6
  "eval_steps": 500,
7
- "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1508,6 +1508,506 @@
1508
  "mean_token_accuracy": 0.6264939974993468,
1509
  "num_tokens": 8909416.0,
1510
  "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1511
  }
1512
  ],
1513
  "logging_steps": 10,
@@ -1527,7 +2027,7 @@
1527
  "attributes": {}
1528
  }
1529
  },
1530
- "total_flos": 7.314786665417933e+16,
1531
  "train_batch_size": 2,
1532
  "trial_name": null,
1533
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.436707500537288,
6
  "eval_steps": 500,
7
+ "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1508
  "mean_token_accuracy": 0.6264939974993468,
1509
  "num_tokens": 8909416.0,
1510
  "step": 1500
1511
+ },
1512
+ {
1513
+ "entropy": 1.8110749498009682,
1514
+ "epoch": 2.5948850204169354,
1515
+ "grad_norm": 0.7108538746833801,
1516
+ "learning_rate": 0.00013964,
1517
+ "loss": 1.8952640533447265,
1518
+ "mean_token_accuracy": 0.6537120632827282,
1519
+ "num_tokens": 8968510.0,
1520
+ "step": 1510
1521
+ },
1522
+ {
1523
+ "entropy": 1.977073846757412,
1524
+ "epoch": 2.612078229099506,
1525
+ "grad_norm": 0.7554802298545837,
1526
+ "learning_rate": 0.00013924,
1527
+ "loss": 2.0621898651123045,
1528
+ "mean_token_accuracy": 0.6327366977930069,
1529
+ "num_tokens": 9026884.0,
1530
+ "step": 1520
1531
+ },
1532
+ {
1533
+ "entropy": 1.8783492282032968,
1534
+ "epoch": 2.629271437782076,
1535
+ "grad_norm": 0.6592015027999878,
1536
+ "learning_rate": 0.00013884000000000002,
1537
+ "loss": 1.9230785369873047,
1538
+ "mean_token_accuracy": 0.6494536675512791,
1539
+ "num_tokens": 9085571.0,
1540
+ "step": 1530
1541
+ },
1542
+ {
1543
+ "entropy": 1.9282778173685073,
1544
+ "epoch": 2.6464646464646466,
1545
+ "grad_norm": 0.7717080116271973,
1546
+ "learning_rate": 0.00013844,
1547
+ "loss": 2.0319377899169924,
1548
+ "mean_token_accuracy": 0.6344667036086321,
1549
+ "num_tokens": 9147549.0,
1550
+ "step": 1540
1551
+ },
1552
+ {
1553
+ "entropy": 1.903467869758606,
1554
+ "epoch": 2.6636578551472168,
1555
+ "grad_norm": 0.6227516531944275,
1556
+ "learning_rate": 0.00013804000000000003,
1557
+ "loss": 1.9306724548339844,
1558
+ "mean_token_accuracy": 0.644033481925726,
1559
+ "num_tokens": 9204942.0,
1560
+ "step": 1550
1561
+ },
1562
+ {
1563
+ "entropy": 1.8967040538787843,
1564
+ "epoch": 2.6808510638297873,
1565
+ "grad_norm": 0.6684938073158264,
1566
+ "learning_rate": 0.00013764000000000002,
1567
+ "loss": 2.001560592651367,
1568
+ "mean_token_accuracy": 0.6470274899154902,
1569
+ "num_tokens": 9266446.0,
1570
+ "step": 1560
1571
+ },
1572
+ {
1573
+ "entropy": 1.8590586185455322,
1574
+ "epoch": 2.6980442725123575,
1575
+ "grad_norm": 0.6150694489479065,
1576
+ "learning_rate": 0.00013724,
1577
+ "loss": 1.9280338287353516,
1578
+ "mean_token_accuracy": 0.6484670951962471,
1579
+ "num_tokens": 9326109.0,
1580
+ "step": 1570
1581
+ },
1582
+ {
1583
+ "entropy": 1.9293041676282883,
1584
+ "epoch": 2.715237481194928,
1585
+ "grad_norm": 0.6057704091072083,
1586
+ "learning_rate": 0.00013684000000000002,
1587
+ "loss": 1.9943519592285157,
1588
+ "mean_token_accuracy": 0.6371258046478033,
1589
+ "num_tokens": 9385073.0,
1590
+ "step": 1580
1591
+ },
1592
+ {
1593
+ "entropy": 1.8843669161200522,
1594
+ "epoch": 2.732430689877498,
1595
+ "grad_norm": 0.6834639310836792,
1596
+ "learning_rate": 0.00013644000000000002,
1597
+ "loss": 1.9569879531860352,
1598
+ "mean_token_accuracy": 0.6437417894601822,
1599
+ "num_tokens": 9445137.0,
1600
+ "step": 1590
1601
+ },
1602
+ {
1603
+ "entropy": 1.8529930964112282,
1604
+ "epoch": 2.7496238985600687,
1605
+ "grad_norm": 0.6442180871963501,
1606
+ "learning_rate": 0.00013604,
1607
+ "loss": 1.8902450561523438,
1608
+ "mean_token_accuracy": 0.6518216013908387,
1609
+ "num_tokens": 9504160.0,
1610
+ "step": 1600
1611
+ },
1612
+ {
1613
+ "entropy": 1.939158782362938,
1614
+ "epoch": 2.7668171072426393,
1615
+ "grad_norm": 0.6240729689598083,
1616
+ "learning_rate": 0.00013564000000000002,
1617
+ "loss": 2.0188575744628907,
1618
+ "mean_token_accuracy": 0.63564417026937,
1619
+ "num_tokens": 9564675.0,
1620
+ "step": 1610
1621
+ },
1622
+ {
1623
+ "entropy": 1.9281259045004844,
1624
+ "epoch": 2.7840103159252094,
1625
+ "grad_norm": 0.750890851020813,
1626
+ "learning_rate": 0.00013524,
1627
+ "loss": 2.017038345336914,
1628
+ "mean_token_accuracy": 0.6387452960014344,
1629
+ "num_tokens": 9625026.0,
1630
+ "step": 1620
1631
+ },
1632
+ {
1633
+ "entropy": 1.873080413043499,
1634
+ "epoch": 2.80120352460778,
1635
+ "grad_norm": 0.776397168636322,
1636
+ "learning_rate": 0.00013484,
1637
+ "loss": 1.9759422302246095,
1638
+ "mean_token_accuracy": 0.6433901283890009,
1639
+ "num_tokens": 9685967.0,
1640
+ "step": 1630
1641
+ },
1642
+ {
1643
+ "entropy": 1.9089648619294166,
1644
+ "epoch": 2.81839673329035,
1645
+ "grad_norm": 0.6481618881225586,
1646
+ "learning_rate": 0.00013444000000000002,
1647
+ "loss": 1.956050491333008,
1648
+ "mean_token_accuracy": 0.6402542922645807,
1649
+ "num_tokens": 9745233.0,
1650
+ "step": 1640
1651
+ },
1652
+ {
1653
+ "entropy": 1.975960558652878,
1654
+ "epoch": 2.8355899419729207,
1655
+ "grad_norm": 0.6896694302558899,
1656
+ "learning_rate": 0.00013404,
1657
+ "loss": 2.0583721160888673,
1658
+ "mean_token_accuracy": 0.6340504981577396,
1659
+ "num_tokens": 9805150.0,
1660
+ "step": 1650
1661
+ },
1662
+ {
1663
+ "entropy": 1.945571132004261,
1664
+ "epoch": 2.8527831506554913,
1665
+ "grad_norm": 0.6386220455169678,
1666
+ "learning_rate": 0.00013364,
1667
+ "loss": 2.03116512298584,
1668
+ "mean_token_accuracy": 0.6365220382809639,
1669
+ "num_tokens": 9861196.0,
1670
+ "step": 1660
1671
+ },
1672
+ {
1673
+ "entropy": 1.9110410138964653,
1674
+ "epoch": 2.8699763593380614,
1675
+ "grad_norm": 0.7503199577331543,
1676
+ "learning_rate": 0.00013324000000000002,
1677
+ "loss": 1.9521196365356446,
1678
+ "mean_token_accuracy": 0.6381696432828903,
1679
+ "num_tokens": 9921155.0,
1680
+ "step": 1670
1681
+ },
1682
+ {
1683
+ "entropy": 1.849820225685835,
1684
+ "epoch": 2.887169568020632,
1685
+ "grad_norm": 0.6197855472564697,
1686
+ "learning_rate": 0.00013284,
1687
+ "loss": 1.8909440994262696,
1688
+ "mean_token_accuracy": 0.6426266122609376,
1689
+ "num_tokens": 9979351.0,
1690
+ "step": 1680
1691
+ },
1692
+ {
1693
+ "entropy": 1.8932805389165879,
1694
+ "epoch": 2.904362776703202,
1695
+ "grad_norm": 0.6703120470046997,
1696
+ "learning_rate": 0.00013244,
1697
+ "loss": 2.0233718872070314,
1698
+ "mean_token_accuracy": 0.646468547359109,
1699
+ "num_tokens": 10041238.0,
1700
+ "step": 1690
1701
+ },
1702
+ {
1703
+ "entropy": 1.8625088930130005,
1704
+ "epoch": 2.9215559853857727,
1705
+ "grad_norm": 0.73073410987854,
1706
+ "learning_rate": 0.00013204000000000002,
1707
+ "loss": 1.9317462921142579,
1708
+ "mean_token_accuracy": 0.6454428397119045,
1709
+ "num_tokens": 10099496.0,
1710
+ "step": 1700
1711
+ },
1712
+ {
1713
+ "entropy": 1.9354272544384004,
1714
+ "epoch": 2.9387491940683432,
1715
+ "grad_norm": 0.6566579937934875,
1716
+ "learning_rate": 0.00013164,
1717
+ "loss": 2.0027164459228515,
1718
+ "mean_token_accuracy": 0.6403926335275173,
1719
+ "num_tokens": 10161720.0,
1720
+ "step": 1710
1721
+ },
1722
+ {
1723
+ "entropy": 1.88578300178051,
1724
+ "epoch": 2.9559424027509134,
1725
+ "grad_norm": 0.7905747890472412,
1726
+ "learning_rate": 0.00013124,
1727
+ "loss": 1.9767372131347656,
1728
+ "mean_token_accuracy": 0.6421503167599439,
1729
+ "num_tokens": 10221734.0,
1730
+ "step": 1720
1731
+ },
1732
+ {
1733
+ "entropy": 1.870301403105259,
1734
+ "epoch": 2.973135611433484,
1735
+ "grad_norm": 0.7210419774055481,
1736
+ "learning_rate": 0.00013084000000000001,
1737
+ "loss": 1.9475433349609375,
1738
+ "mean_token_accuracy": 0.6528905872255564,
1739
+ "num_tokens": 10280223.0,
1740
+ "step": 1730
1741
+ },
1742
+ {
1743
+ "entropy": 1.8696911588311196,
1744
+ "epoch": 2.990328820116054,
1745
+ "grad_norm": 0.626354992389679,
1746
+ "learning_rate": 0.00013044,
1747
+ "loss": 1.926706314086914,
1748
+ "mean_token_accuracy": 0.6482070714235306,
1749
+ "num_tokens": 10339813.0,
1750
+ "step": 1740
1751
+ },
1752
+ {
1753
+ "entropy": 1.821205088844547,
1754
+ "epoch": 3.006877283473028,
1755
+ "grad_norm": 0.6353569030761719,
1756
+ "learning_rate": 0.00013004,
1757
+ "loss": 1.8657075881958007,
1758
+ "mean_token_accuracy": 0.6556356762136731,
1759
+ "num_tokens": 10398519.0,
1760
+ "step": 1750
1761
+ },
1762
+ {
1763
+ "entropy": 1.8890676617622375,
1764
+ "epoch": 3.0240704921555985,
1765
+ "grad_norm": 0.783729076385498,
1766
+ "learning_rate": 0.00012964,
1767
+ "loss": 1.9794137954711915,
1768
+ "mean_token_accuracy": 0.643126554042101,
1769
+ "num_tokens": 10456386.0,
1770
+ "step": 1760
1771
+ },
1772
+ {
1773
+ "entropy": 1.8766882956027984,
1774
+ "epoch": 3.041263700838169,
1775
+ "grad_norm": 0.7075045108795166,
1776
+ "learning_rate": 0.00012924,
1777
+ "loss": 1.9388771057128906,
1778
+ "mean_token_accuracy": 0.6462941512465477,
1779
+ "num_tokens": 10516721.0,
1780
+ "step": 1770
1781
+ },
1782
+ {
1783
+ "entropy": 1.7985384911298752,
1784
+ "epoch": 3.0584569095207392,
1785
+ "grad_norm": 0.7116262912750244,
1786
+ "learning_rate": 0.00012884,
1787
+ "loss": 1.8379974365234375,
1788
+ "mean_token_accuracy": 0.6582404263317585,
1789
+ "num_tokens": 10575553.0,
1790
+ "step": 1780
1791
+ },
1792
+ {
1793
+ "entropy": 1.8475583091378212,
1794
+ "epoch": 3.07565011820331,
1795
+ "grad_norm": 0.69736248254776,
1796
+ "learning_rate": 0.00012844,
1797
+ "loss": 1.9197765350341798,
1798
+ "mean_token_accuracy": 0.6509403776377439,
1799
+ "num_tokens": 10632501.0,
1800
+ "step": 1790
1801
+ },
1802
+ {
1803
+ "entropy": 1.8264927819371224,
1804
+ "epoch": 3.09284332688588,
1805
+ "grad_norm": 0.6354222297668457,
1806
+ "learning_rate": 0.00012804,
1807
+ "loss": 1.8965986251831055,
1808
+ "mean_token_accuracy": 0.6518782209604979,
1809
+ "num_tokens": 10693167.0,
1810
+ "step": 1800
1811
+ },
1812
+ {
1813
+ "entropy": 1.8696907818317414,
1814
+ "epoch": 3.1100365355684505,
1815
+ "grad_norm": 0.7568804621696472,
1816
+ "learning_rate": 0.00012764,
1817
+ "loss": 1.9332853317260743,
1818
+ "mean_token_accuracy": 0.6471077598631382,
1819
+ "num_tokens": 10753837.0,
1820
+ "step": 1810
1821
+ },
1822
+ {
1823
+ "entropy": 1.886954003572464,
1824
+ "epoch": 3.1272297442510206,
1825
+ "grad_norm": 0.7069846391677856,
1826
+ "learning_rate": 0.00012724,
1827
+ "loss": 1.9263908386230468,
1828
+ "mean_token_accuracy": 0.6466126769781113,
1829
+ "num_tokens": 10815256.0,
1830
+ "step": 1820
1831
+ },
1832
+ {
1833
+ "entropy": 1.8424360305070877,
1834
+ "epoch": 3.144422952933591,
1835
+ "grad_norm": 0.6524083614349365,
1836
+ "learning_rate": 0.00012684,
1837
+ "loss": 1.9088315963745117,
1838
+ "mean_token_accuracy": 0.6496367674320936,
1839
+ "num_tokens": 10877848.0,
1840
+ "step": 1830
1841
+ },
1842
+ {
1843
+ "entropy": 1.8966794192790986,
1844
+ "epoch": 3.1616161616161618,
1845
+ "grad_norm": 0.687421977519989,
1846
+ "learning_rate": 0.00012644000000000002,
1847
+ "loss": 1.9748069763183593,
1848
+ "mean_token_accuracy": 0.6424707356840372,
1849
+ "num_tokens": 10938042.0,
1850
+ "step": 1840
1851
+ },
1852
+ {
1853
+ "entropy": 1.81406429708004,
1854
+ "epoch": 3.178809370298732,
1855
+ "grad_norm": 0.7668496370315552,
1856
+ "learning_rate": 0.00012604,
1857
+ "loss": 1.8712465286254882,
1858
+ "mean_token_accuracy": 0.6571074567735196,
1859
+ "num_tokens": 10996204.0,
1860
+ "step": 1850
1861
+ },
1862
+ {
1863
+ "entropy": 1.8159340515732765,
1864
+ "epoch": 3.1960025789813025,
1865
+ "grad_norm": 0.7182545065879822,
1866
+ "learning_rate": 0.00012564,
1867
+ "loss": 1.830276107788086,
1868
+ "mean_token_accuracy": 0.6546356856822968,
1869
+ "num_tokens": 11056605.0,
1870
+ "step": 1860
1871
+ },
1872
+ {
1873
+ "entropy": 1.9095668271183968,
1874
+ "epoch": 3.2131957876638726,
1875
+ "grad_norm": 0.7548812031745911,
1876
+ "learning_rate": 0.00012524000000000001,
1877
+ "loss": 1.998922348022461,
1878
+ "mean_token_accuracy": 0.6411306612193585,
1879
+ "num_tokens": 11116614.0,
1880
+ "step": 1870
1881
+ },
1882
+ {
1883
+ "entropy": 1.8717206478118897,
1884
+ "epoch": 3.230388996346443,
1885
+ "grad_norm": 0.7692223191261292,
1886
+ "learning_rate": 0.00012484,
1887
+ "loss": 1.914438247680664,
1888
+ "mean_token_accuracy": 0.6441164951771498,
1889
+ "num_tokens": 11175802.0,
1890
+ "step": 1880
1891
+ },
1892
+ {
1893
+ "entropy": 1.8943733513355254,
1894
+ "epoch": 3.2475822050290137,
1895
+ "grad_norm": 0.6439138650894165,
1896
+ "learning_rate": 0.00012444,
1897
+ "loss": 1.9280553817749024,
1898
+ "mean_token_accuracy": 0.6476396139711141,
1899
+ "num_tokens": 11236477.0,
1900
+ "step": 1890
1901
+ },
1902
+ {
1903
+ "entropy": 1.8841392308473588,
1904
+ "epoch": 3.264775413711584,
1905
+ "grad_norm": 0.6971343159675598,
1906
+ "learning_rate": 0.00012404,
1907
+ "loss": 1.942568588256836,
1908
+ "mean_token_accuracy": 0.6398356795310974,
1909
+ "num_tokens": 11295146.0,
1910
+ "step": 1900
1911
+ },
1912
+ {
1913
+ "entropy": 1.8830088019371032,
1914
+ "epoch": 3.2819686223941544,
1915
+ "grad_norm": 0.7196023464202881,
1916
+ "learning_rate": 0.00012364,
1917
+ "loss": 1.963007354736328,
1918
+ "mean_token_accuracy": 0.6452915534377098,
1919
+ "num_tokens": 11355726.0,
1920
+ "step": 1910
1921
+ },
1922
+ {
1923
+ "entropy": 1.927216087281704,
1924
+ "epoch": 3.2991618310767246,
1925
+ "grad_norm": 0.790634274482727,
1926
+ "learning_rate": 0.00012324,
1927
+ "loss": 2.0809165954589846,
1928
+ "mean_token_accuracy": 0.6384686015546321,
1929
+ "num_tokens": 11415237.0,
1930
+ "step": 1920
1931
+ },
1932
+ {
1933
+ "entropy": 1.849087017774582,
1934
+ "epoch": 3.316355039759295,
1935
+ "grad_norm": 0.6752087473869324,
1936
+ "learning_rate": 0.00012284,
1937
+ "loss": 1.9017595291137694,
1938
+ "mean_token_accuracy": 0.6522149413824081,
1939
+ "num_tokens": 11476337.0,
1940
+ "step": 1930
1941
+ },
1942
+ {
1943
+ "entropy": 1.8517325416207313,
1944
+ "epoch": 3.3335482484418657,
1945
+ "grad_norm": 0.8036973476409912,
1946
+ "learning_rate": 0.00012244,
1947
+ "loss": 1.9011222839355468,
1948
+ "mean_token_accuracy": 0.6499856971204281,
1949
+ "num_tokens": 11537529.0,
1950
+ "step": 1940
1951
+ },
1952
+ {
1953
+ "entropy": 1.7622334837913514,
1954
+ "epoch": 3.350741457124436,
1955
+ "grad_norm": 0.7138587832450867,
1956
+ "learning_rate": 0.00012204,
1957
+ "loss": 1.7955827713012695,
1958
+ "mean_token_accuracy": 0.6596556272357702,
1959
+ "num_tokens": 11595421.0,
1960
+ "step": 1950
1961
+ },
1962
+ {
1963
+ "entropy": 1.8950866341590882,
1964
+ "epoch": 3.3679346658070064,
1965
+ "grad_norm": 0.6869714260101318,
1966
+ "learning_rate": 0.00012164,
1967
+ "loss": 1.948552131652832,
1968
+ "mean_token_accuracy": 0.6493024453520775,
1969
+ "num_tokens": 11655749.0,
1970
+ "step": 1960
1971
+ },
1972
+ {
1973
+ "entropy": 1.9235218942165375,
1974
+ "epoch": 3.3851278744895765,
1975
+ "grad_norm": 0.656403124332428,
1976
+ "learning_rate": 0.00012124,
1977
+ "loss": 2.04327449798584,
1978
+ "mean_token_accuracy": 0.6389912366867065,
1979
+ "num_tokens": 11717271.0,
1980
+ "step": 1970
1981
+ },
1982
+ {
1983
+ "entropy": 1.834906594455242,
1984
+ "epoch": 3.402321083172147,
1985
+ "grad_norm": 0.7343699932098389,
1986
+ "learning_rate": 0.00012084,
1987
+ "loss": 1.9038848876953125,
1988
+ "mean_token_accuracy": 0.6569048661738635,
1989
+ "num_tokens": 11778095.0,
1990
+ "step": 1980
1991
+ },
1992
+ {
1993
+ "entropy": 1.8515655741095542,
1994
+ "epoch": 3.4195142918547172,
1995
+ "grad_norm": 0.7009745240211487,
1996
+ "learning_rate": 0.00012043999999999999,
1997
+ "loss": 1.9157728195190429,
1998
+ "mean_token_accuracy": 0.6512683361768723,
1999
+ "num_tokens": 11835954.0,
2000
+ "step": 1990
2001
+ },
2002
+ {
2003
+ "entropy": 1.8634012743830681,
2004
+ "epoch": 3.436707500537288,
2005
+ "grad_norm": 0.6880552172660828,
2006
+ "learning_rate": 0.00012004,
2007
+ "loss": 1.9772762298583983,
2008
+ "mean_token_accuracy": 0.6531724959611893,
2009
+ "num_tokens": 11896615.0,
2010
+ "step": 2000
2011
  }
2012
  ],
2013
  "logging_steps": 10,
 
2027
  "attributes": {}
2028
  }
2029
  },
2030
+ "total_flos": 9.772738986953933e+16,
2031
  "train_batch_size": 2,
2032
  "trial_name": null,
2033
  "trial_params": null