JustinDuc commited on
Commit
754e28f
·
verified ·
1 Parent(s): 5804cb1

Training in progress, step 10000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5055764d8bf73f79ca7a428dd271da991268e85a2d9756a29b72f1bbefb9104a
3
  size 560983656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c8e6799b936bbb72f55d38f901d37d73eb48aa2c7687d5081feb69086f1ca29
3
  size 560983656
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f66e663ef1782e6f78f2a436a2d29f26ecfb1c44e1adbfb0b37aba90dd2d164
3
  size 246053739
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8459e376ea931998d33c038080ed53ca14e8b5c5ec8f7e930035f963ea0160eb
3
  size 246053739
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:97ee020279a22b6910a66d68f6a36548720e0f741299e197d68a0300097035d6
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:713b57f62ca2b5f71e37e51f362f0a8f37b5504e2e104de0c5deeb68a0b57d4c
3
  size 14244
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0153ef62604f0a19d6db352c0efdd59e737e890fcf932261c593f4cb5173e799
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:581a3b97e73590097d6ffd55f019c06921d3c41b5bf808069d22c75dbddbf820
3
  size 988
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2715b803771bb9f0f36fe0c01e062993d36b56f916510bbea310e555c567b4cf
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b24df43f2a8c3c00021031b8f6e5b0ecb8e2da1059013876673704ecfc7bfbf7
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.008392204648945687,
6
  "eval_steps": 50,
7
- "global_step": 5000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1608,6 +1608,1606 @@
1608
  "eval_samples_per_second": 5.614,
1609
  "eval_steps_per_second": 2.245,
1610
  "step": 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1611
  }
1612
  ],
1613
  "logging_steps": 50,
@@ -1627,7 +3227,7 @@
1627
  "attributes": {}
1628
  }
1629
  },
1630
- "total_flos": 8043289130117280.0,
1631
  "train_batch_size": 2,
1632
  "trial_name": null,
1633
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.016784409297891375,
6
  "eval_steps": 50,
7
+ "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1608
  "eval_samples_per_second": 5.614,
1609
  "eval_steps_per_second": 2.245,
1610
  "step": 5000
1611
+ },
1612
+ {
1613
+ "epoch": 0.008476126695435144,
1614
+ "grad_norm": 6.474329471588135,
1615
+ "learning_rate": 4.9576445431367715e-05,
1616
+ "loss": 1.8228,
1617
+ "step": 5050
1618
+ },
1619
+ {
1620
+ "epoch": 0.008476126695435144,
1621
+ "eval_loss": 1.4911173582077026,
1622
+ "eval_masked_accuracy": 0.71875,
1623
+ "eval_runtime": 1.8052,
1624
+ "eval_samples_per_second": 5.54,
1625
+ "eval_steps_per_second": 2.216,
1626
+ "step": 5050
1627
+ },
1628
+ {
1629
+ "epoch": 0.0085600487419246,
1630
+ "grad_norm": 4.493051052093506,
1631
+ "learning_rate": 4.9572249329043244e-05,
1632
+ "loss": 1.5526,
1633
+ "step": 5100
1634
+ },
1635
+ {
1636
+ "epoch": 0.0085600487419246,
1637
+ "eval_loss": 1.4060901403427124,
1638
+ "eval_masked_accuracy": 0.7131474018096924,
1639
+ "eval_runtime": 1.8193,
1640
+ "eval_samples_per_second": 5.497,
1641
+ "eval_steps_per_second": 2.199,
1642
+ "step": 5100
1643
+ },
1644
+ {
1645
+ "epoch": 0.008643970788414057,
1646
+ "grad_norm": 5.657381057739258,
1647
+ "learning_rate": 4.956805322671877e-05,
1648
+ "loss": 1.5743,
1649
+ "step": 5150
1650
+ },
1651
+ {
1652
+ "epoch": 0.008643970788414057,
1653
+ "eval_loss": 1.7347627878189087,
1654
+ "eval_masked_accuracy": 0.6392694115638733,
1655
+ "eval_runtime": 1.7632,
1656
+ "eval_samples_per_second": 5.671,
1657
+ "eval_steps_per_second": 2.269,
1658
+ "step": 5150
1659
+ },
1660
+ {
1661
+ "epoch": 0.008727892834903515,
1662
+ "grad_norm": 5.059664726257324,
1663
+ "learning_rate": 4.9563941046440784e-05,
1664
+ "loss": 1.5923,
1665
+ "step": 5200
1666
+ },
1667
+ {
1668
+ "epoch": 0.008727892834903515,
1669
+ "eval_loss": 1.7108001708984375,
1670
+ "eval_masked_accuracy": 0.6759999990463257,
1671
+ "eval_runtime": 1.7312,
1672
+ "eval_samples_per_second": 5.776,
1673
+ "eval_steps_per_second": 2.311,
1674
+ "step": 5200
1675
+ },
1676
+ {
1677
+ "epoch": 0.008811814881392972,
1678
+ "grad_norm": 6.256536483764648,
1679
+ "learning_rate": 4.955974494411631e-05,
1680
+ "loss": 1.5454,
1681
+ "step": 5250
1682
+ },
1683
+ {
1684
+ "epoch": 0.008811814881392972,
1685
+ "eval_loss": 1.8423763513565063,
1686
+ "eval_masked_accuracy": 0.6590909361839294,
1687
+ "eval_runtime": 1.7323,
1688
+ "eval_samples_per_second": 5.773,
1689
+ "eval_steps_per_second": 2.309,
1690
+ "step": 5250
1691
+ },
1692
+ {
1693
+ "epoch": 0.008895736927882428,
1694
+ "grad_norm": 6.45760440826416,
1695
+ "learning_rate": 4.955554884179184e-05,
1696
+ "loss": 1.5381,
1697
+ "step": 5300
1698
+ },
1699
+ {
1700
+ "epoch": 0.008895736927882428,
1701
+ "eval_loss": 1.8820030689239502,
1702
+ "eval_masked_accuracy": 0.6486486196517944,
1703
+ "eval_runtime": 1.7529,
1704
+ "eval_samples_per_second": 5.705,
1705
+ "eval_steps_per_second": 2.282,
1706
+ "step": 5300
1707
+ },
1708
+ {
1709
+ "epoch": 0.008979658974371885,
1710
+ "grad_norm": 7.668667793273926,
1711
+ "learning_rate": 4.955135273946737e-05,
1712
+ "loss": 1.6363,
1713
+ "step": 5350
1714
+ },
1715
+ {
1716
+ "epoch": 0.008979658974371885,
1717
+ "eval_loss": 1.631400465965271,
1718
+ "eval_masked_accuracy": 0.7160493731498718,
1719
+ "eval_runtime": 1.7511,
1720
+ "eval_samples_per_second": 5.711,
1721
+ "eval_steps_per_second": 2.284,
1722
+ "step": 5350
1723
+ },
1724
+ {
1725
+ "epoch": 0.009063581020861342,
1726
+ "grad_norm": 7.2050018310546875,
1727
+ "learning_rate": 4.954715663714289e-05,
1728
+ "loss": 1.5738,
1729
+ "step": 5400
1730
+ },
1731
+ {
1732
+ "epoch": 0.009063581020861342,
1733
+ "eval_loss": 1.5917881727218628,
1734
+ "eval_masked_accuracy": 0.7405857443809509,
1735
+ "eval_runtime": 1.75,
1736
+ "eval_samples_per_second": 5.714,
1737
+ "eval_steps_per_second": 2.286,
1738
+ "step": 5400
1739
+ },
1740
+ {
1741
+ "epoch": 0.0091475030673508,
1742
+ "grad_norm": 6.094969749450684,
1743
+ "learning_rate": 4.954296053481842e-05,
1744
+ "loss": 1.7321,
1745
+ "step": 5450
1746
+ },
1747
+ {
1748
+ "epoch": 0.0091475030673508,
1749
+ "eval_loss": 1.5327577590942383,
1750
+ "eval_masked_accuracy": 0.707317054271698,
1751
+ "eval_runtime": 1.7423,
1752
+ "eval_samples_per_second": 5.74,
1753
+ "eval_steps_per_second": 2.296,
1754
+ "step": 5450
1755
+ },
1756
+ {
1757
+ "epoch": 0.009231425113840256,
1758
+ "grad_norm": 8.869881629943848,
1759
+ "learning_rate": 4.953876443249395e-05,
1760
+ "loss": 1.5768,
1761
+ "step": 5500
1762
+ },
1763
+ {
1764
+ "epoch": 0.009231425113840256,
1765
+ "eval_loss": 1.3501726388931274,
1766
+ "eval_masked_accuracy": 0.7801724076271057,
1767
+ "eval_runtime": 1.7732,
1768
+ "eval_samples_per_second": 5.64,
1769
+ "eval_steps_per_second": 2.256,
1770
+ "step": 5500
1771
+ },
1772
+ {
1773
+ "epoch": 0.009315347160329713,
1774
+ "grad_norm": 4.408574104309082,
1775
+ "learning_rate": 4.9534568330169476e-05,
1776
+ "loss": 1.5802,
1777
+ "step": 5550
1778
+ },
1779
+ {
1780
+ "epoch": 0.009315347160329713,
1781
+ "eval_loss": 1.7055152654647827,
1782
+ "eval_masked_accuracy": 0.6707317233085632,
1783
+ "eval_runtime": 1.7418,
1784
+ "eval_samples_per_second": 5.741,
1785
+ "eval_steps_per_second": 2.296,
1786
+ "step": 5550
1787
+ },
1788
+ {
1789
+ "epoch": 0.00939926920681917,
1790
+ "grad_norm": 5.3869147300720215,
1791
+ "learning_rate": 4.9530372227845e-05,
1792
+ "loss": 1.5547,
1793
+ "step": 5600
1794
+ },
1795
+ {
1796
+ "epoch": 0.00939926920681917,
1797
+ "eval_loss": 1.3663699626922607,
1798
+ "eval_masked_accuracy": 0.6974790096282959,
1799
+ "eval_runtime": 1.7338,
1800
+ "eval_samples_per_second": 5.768,
1801
+ "eval_steps_per_second": 2.307,
1802
+ "step": 5600
1803
+ },
1804
+ {
1805
+ "epoch": 0.009483191253308626,
1806
+ "grad_norm": 4.417982578277588,
1807
+ "learning_rate": 4.9526176125520526e-05,
1808
+ "loss": 1.5658,
1809
+ "step": 5650
1810
+ },
1811
+ {
1812
+ "epoch": 0.009483191253308626,
1813
+ "eval_loss": 1.6572059392929077,
1814
+ "eval_masked_accuracy": 0.6520000100135803,
1815
+ "eval_runtime": 1.7583,
1816
+ "eval_samples_per_second": 5.687,
1817
+ "eval_steps_per_second": 2.275,
1818
+ "step": 5650
1819
+ },
1820
+ {
1821
+ "epoch": 0.009567113299798084,
1822
+ "grad_norm": 5.2137861251831055,
1823
+ "learning_rate": 4.9521980023196055e-05,
1824
+ "loss": 1.5929,
1825
+ "step": 5700
1826
+ },
1827
+ {
1828
+ "epoch": 0.009567113299798084,
1829
+ "eval_loss": 1.4574190378189087,
1830
+ "eval_masked_accuracy": 0.6694560647010803,
1831
+ "eval_runtime": 1.7352,
1832
+ "eval_samples_per_second": 5.763,
1833
+ "eval_steps_per_second": 2.305,
1834
+ "step": 5700
1835
+ },
1836
+ {
1837
+ "epoch": 0.00965103534628754,
1838
+ "grad_norm": 6.848864555358887,
1839
+ "learning_rate": 4.951778392087158e-05,
1840
+ "loss": 1.6008,
1841
+ "step": 5750
1842
+ },
1843
+ {
1844
+ "epoch": 0.00965103534628754,
1845
+ "eval_loss": 2.133417844772339,
1846
+ "eval_masked_accuracy": 0.6540084481239319,
1847
+ "eval_runtime": 1.8568,
1848
+ "eval_samples_per_second": 5.386,
1849
+ "eval_steps_per_second": 2.154,
1850
+ "step": 5750
1851
+ },
1852
+ {
1853
+ "epoch": 0.009734957392776997,
1854
+ "grad_norm": 3.9827840328216553,
1855
+ "learning_rate": 4.9513587818547105e-05,
1856
+ "loss": 1.5811,
1857
+ "step": 5800
1858
+ },
1859
+ {
1860
+ "epoch": 0.009734957392776997,
1861
+ "eval_loss": 1.403198003768921,
1862
+ "eval_masked_accuracy": 0.7085201740264893,
1863
+ "eval_runtime": 1.749,
1864
+ "eval_samples_per_second": 5.717,
1865
+ "eval_steps_per_second": 2.287,
1866
+ "step": 5800
1867
+ },
1868
+ {
1869
+ "epoch": 0.009818879439266454,
1870
+ "grad_norm": 4.541887283325195,
1871
+ "learning_rate": 4.950939171622263e-05,
1872
+ "loss": 1.558,
1873
+ "step": 5850
1874
+ },
1875
+ {
1876
+ "epoch": 0.009818879439266454,
1877
+ "eval_loss": 1.4281632900238037,
1878
+ "eval_masked_accuracy": 0.7195122241973877,
1879
+ "eval_runtime": 1.7523,
1880
+ "eval_samples_per_second": 5.707,
1881
+ "eval_steps_per_second": 2.283,
1882
+ "step": 5850
1883
+ },
1884
+ {
1885
+ "epoch": 0.00990280148575591,
1886
+ "grad_norm": 8.121429443359375,
1887
+ "learning_rate": 4.950519561389816e-05,
1888
+ "loss": 1.5583,
1889
+ "step": 5900
1890
+ },
1891
+ {
1892
+ "epoch": 0.00990280148575591,
1893
+ "eval_loss": 1.608547568321228,
1894
+ "eval_masked_accuracy": 0.6582278609275818,
1895
+ "eval_runtime": 1.7405,
1896
+ "eval_samples_per_second": 5.745,
1897
+ "eval_steps_per_second": 2.298,
1898
+ "step": 5900
1899
+ },
1900
+ {
1901
+ "epoch": 0.009986723532245369,
1902
+ "grad_norm": 4.750977039337158,
1903
+ "learning_rate": 4.950099951157369e-05,
1904
+ "loss": 1.5378,
1905
+ "step": 5950
1906
+ },
1907
+ {
1908
+ "epoch": 0.009986723532245369,
1909
+ "eval_loss": 1.3912121057510376,
1910
+ "eval_masked_accuracy": 0.701298713684082,
1911
+ "eval_runtime": 1.7623,
1912
+ "eval_samples_per_second": 5.674,
1913
+ "eval_steps_per_second": 2.27,
1914
+ "step": 5950
1915
+ },
1916
+ {
1917
+ "epoch": 0.010070645578734825,
1918
+ "grad_norm": 4.445640563964844,
1919
+ "learning_rate": 4.949680340924922e-05,
1920
+ "loss": 1.5063,
1921
+ "step": 6000
1922
+ },
1923
+ {
1924
+ "epoch": 0.010070645578734825,
1925
+ "eval_loss": 1.6513465642929077,
1926
+ "eval_masked_accuracy": 0.6796537041664124,
1927
+ "eval_runtime": 1.7424,
1928
+ "eval_samples_per_second": 5.739,
1929
+ "eval_steps_per_second": 2.296,
1930
+ "step": 6000
1931
+ },
1932
+ {
1933
+ "epoch": 0.010154567625224282,
1934
+ "grad_norm": 13.394184112548828,
1935
+ "learning_rate": 4.949260730692475e-05,
1936
+ "loss": 1.5155,
1937
+ "step": 6050
1938
+ },
1939
+ {
1940
+ "epoch": 0.010154567625224282,
1941
+ "eval_loss": 1.5842430591583252,
1942
+ "eval_masked_accuracy": 0.6853448152542114,
1943
+ "eval_runtime": 1.7416,
1944
+ "eval_samples_per_second": 5.742,
1945
+ "eval_steps_per_second": 2.297,
1946
+ "step": 6050
1947
+ },
1948
+ {
1949
+ "epoch": 0.010238489671713738,
1950
+ "grad_norm": 7.441386699676514,
1951
+ "learning_rate": 4.948841120460027e-05,
1952
+ "loss": 1.5009,
1953
+ "step": 6100
1954
+ },
1955
+ {
1956
+ "epoch": 0.010238489671713738,
1957
+ "eval_loss": 1.512109637260437,
1958
+ "eval_masked_accuracy": 0.6987447738647461,
1959
+ "eval_runtime": 1.7546,
1960
+ "eval_samples_per_second": 5.699,
1961
+ "eval_steps_per_second": 2.28,
1962
+ "step": 6100
1963
+ },
1964
+ {
1965
+ "epoch": 0.010322411718203195,
1966
+ "grad_norm": 6.1988749504089355,
1967
+ "learning_rate": 4.94842151022758e-05,
1968
+ "loss": 1.5567,
1969
+ "step": 6150
1970
+ },
1971
+ {
1972
+ "epoch": 0.010322411718203195,
1973
+ "eval_loss": 1.5210555791854858,
1974
+ "eval_masked_accuracy": 0.7109375,
1975
+ "eval_runtime": 1.7524,
1976
+ "eval_samples_per_second": 5.707,
1977
+ "eval_steps_per_second": 2.283,
1978
+ "step": 6150
1979
+ },
1980
+ {
1981
+ "epoch": 0.010406333764692651,
1982
+ "grad_norm": 4.782381057739258,
1983
+ "learning_rate": 4.9480018999951325e-05,
1984
+ "loss": 1.6125,
1985
+ "step": 6200
1986
+ },
1987
+ {
1988
+ "epoch": 0.010406333764692651,
1989
+ "eval_loss": 1.6434142589569092,
1990
+ "eval_masked_accuracy": 0.6638655662536621,
1991
+ "eval_runtime": 1.7489,
1992
+ "eval_samples_per_second": 5.718,
1993
+ "eval_steps_per_second": 2.287,
1994
+ "step": 6200
1995
+ },
1996
+ {
1997
+ "epoch": 0.01049025581118211,
1998
+ "grad_norm": 5.14832878112793,
1999
+ "learning_rate": 4.9475822897626854e-05,
2000
+ "loss": 1.6089,
2001
+ "step": 6250
2002
+ },
2003
+ {
2004
+ "epoch": 0.01049025581118211,
2005
+ "eval_loss": 1.239379644393921,
2006
+ "eval_masked_accuracy": 0.7427386045455933,
2007
+ "eval_runtime": 1.7532,
2008
+ "eval_samples_per_second": 5.704,
2009
+ "eval_steps_per_second": 2.282,
2010
+ "step": 6250
2011
+ },
2012
+ {
2013
+ "epoch": 0.010574177857671566,
2014
+ "grad_norm": 5.390649795532227,
2015
+ "learning_rate": 4.947162679530238e-05,
2016
+ "loss": 1.6357,
2017
+ "step": 6300
2018
+ },
2019
+ {
2020
+ "epoch": 0.010574177857671566,
2021
+ "eval_loss": 1.5129663944244385,
2022
+ "eval_masked_accuracy": 0.692307710647583,
2023
+ "eval_runtime": 1.7523,
2024
+ "eval_samples_per_second": 5.707,
2025
+ "eval_steps_per_second": 2.283,
2026
+ "step": 6300
2027
+ },
2028
+ {
2029
+ "epoch": 0.010658099904161023,
2030
+ "grad_norm": 4.3327412605285645,
2031
+ "learning_rate": 4.9467430692977904e-05,
2032
+ "loss": 1.5318,
2033
+ "step": 6350
2034
+ },
2035
+ {
2036
+ "epoch": 0.010658099904161023,
2037
+ "eval_loss": 1.7716737985610962,
2038
+ "eval_masked_accuracy": 0.6942148804664612,
2039
+ "eval_runtime": 1.7284,
2040
+ "eval_samples_per_second": 5.786,
2041
+ "eval_steps_per_second": 2.314,
2042
+ "step": 6350
2043
+ },
2044
+ {
2045
+ "epoch": 0.01074202195065048,
2046
+ "grad_norm": 5.145776271820068,
2047
+ "learning_rate": 4.946323459065343e-05,
2048
+ "loss": 1.6081,
2049
+ "step": 6400
2050
+ },
2051
+ {
2052
+ "epoch": 0.01074202195065048,
2053
+ "eval_loss": 1.6661970615386963,
2054
+ "eval_masked_accuracy": 0.6882591247558594,
2055
+ "eval_runtime": 1.7486,
2056
+ "eval_samples_per_second": 5.719,
2057
+ "eval_steps_per_second": 2.288,
2058
+ "step": 6400
2059
+ },
2060
+ {
2061
+ "epoch": 0.010825943997139936,
2062
+ "grad_norm": 5.037006855010986,
2063
+ "learning_rate": 4.945903848832896e-05,
2064
+ "loss": 1.5028,
2065
+ "step": 6450
2066
+ },
2067
+ {
2068
+ "epoch": 0.010825943997139936,
2069
+ "eval_loss": 1.4679136276245117,
2070
+ "eval_masked_accuracy": 0.714893639087677,
2071
+ "eval_runtime": 1.7514,
2072
+ "eval_samples_per_second": 5.71,
2073
+ "eval_steps_per_second": 2.284,
2074
+ "step": 6450
2075
+ },
2076
+ {
2077
+ "epoch": 0.010909866043629394,
2078
+ "grad_norm": 5.618253707885742,
2079
+ "learning_rate": 4.945484238600449e-05,
2080
+ "loss": 1.5477,
2081
+ "step": 6500
2082
+ },
2083
+ {
2084
+ "epoch": 0.010909866043629394,
2085
+ "eval_loss": 1.6666347980499268,
2086
+ "eval_masked_accuracy": 0.7094017267227173,
2087
+ "eval_runtime": 1.7486,
2088
+ "eval_samples_per_second": 5.719,
2089
+ "eval_steps_per_second": 2.288,
2090
+ "step": 6500
2091
+ },
2092
+ {
2093
+ "epoch": 0.01099378809011885,
2094
+ "grad_norm": 14.34435749053955,
2095
+ "learning_rate": 4.945064628368002e-05,
2096
+ "loss": 1.6291,
2097
+ "step": 6550
2098
+ },
2099
+ {
2100
+ "epoch": 0.01099378809011885,
2101
+ "eval_loss": 1.8381481170654297,
2102
+ "eval_masked_accuracy": 0.6547085046768188,
2103
+ "eval_runtime": 1.7548,
2104
+ "eval_samples_per_second": 5.699,
2105
+ "eval_steps_per_second": 2.279,
2106
+ "step": 6550
2107
+ },
2108
+ {
2109
+ "epoch": 0.011077710136608307,
2110
+ "grad_norm": 4.846654891967773,
2111
+ "learning_rate": 4.9446450181355546e-05,
2112
+ "loss": 1.6077,
2113
+ "step": 6600
2114
+ },
2115
+ {
2116
+ "epoch": 0.011077710136608307,
2117
+ "eval_loss": 1.5568077564239502,
2118
+ "eval_masked_accuracy": 0.6872428059577942,
2119
+ "eval_runtime": 1.7324,
2120
+ "eval_samples_per_second": 5.772,
2121
+ "eval_steps_per_second": 2.309,
2122
+ "step": 6600
2123
+ },
2124
+ {
2125
+ "epoch": 0.011161632183097764,
2126
+ "grad_norm": 5.304859161376953,
2127
+ "learning_rate": 4.944225407903107e-05,
2128
+ "loss": 1.5758,
2129
+ "step": 6650
2130
+ },
2131
+ {
2132
+ "epoch": 0.011161632183097764,
2133
+ "eval_loss": 1.3110054731369019,
2134
+ "eval_masked_accuracy": 0.7312775254249573,
2135
+ "eval_runtime": 1.7439,
2136
+ "eval_samples_per_second": 5.734,
2137
+ "eval_steps_per_second": 2.294,
2138
+ "step": 6650
2139
+ },
2140
+ {
2141
+ "epoch": 0.01124555422958722,
2142
+ "grad_norm": 6.187143802642822,
2143
+ "learning_rate": 4.9438057976706596e-05,
2144
+ "loss": 1.5817,
2145
+ "step": 6700
2146
+ },
2147
+ {
2148
+ "epoch": 0.01124555422958722,
2149
+ "eval_loss": 1.7989356517791748,
2150
+ "eval_masked_accuracy": 0.6666666865348816,
2151
+ "eval_runtime": 1.754,
2152
+ "eval_samples_per_second": 5.701,
2153
+ "eval_steps_per_second": 2.28,
2154
+ "step": 6700
2155
+ },
2156
+ {
2157
+ "epoch": 0.011329476276076679,
2158
+ "grad_norm": 5.595826148986816,
2159
+ "learning_rate": 4.9433861874382124e-05,
2160
+ "loss": 1.6367,
2161
+ "step": 6750
2162
+ },
2163
+ {
2164
+ "epoch": 0.011329476276076679,
2165
+ "eval_loss": 1.7425569295883179,
2166
+ "eval_masked_accuracy": 0.6583333611488342,
2167
+ "eval_runtime": 1.7467,
2168
+ "eval_samples_per_second": 5.725,
2169
+ "eval_steps_per_second": 2.29,
2170
+ "step": 6750
2171
+ },
2172
+ {
2173
+ "epoch": 0.011413398322566135,
2174
+ "grad_norm": 4.125125408172607,
2175
+ "learning_rate": 4.942966577205765e-05,
2176
+ "loss": 1.641,
2177
+ "step": 6800
2178
+ },
2179
+ {
2180
+ "epoch": 0.011413398322566135,
2181
+ "eval_loss": 1.728715181350708,
2182
+ "eval_masked_accuracy": 0.6652892827987671,
2183
+ "eval_runtime": 1.772,
2184
+ "eval_samples_per_second": 5.643,
2185
+ "eval_steps_per_second": 2.257,
2186
+ "step": 6800
2187
+ },
2188
+ {
2189
+ "epoch": 0.011497320369055592,
2190
+ "grad_norm": 6.3898844718933105,
2191
+ "learning_rate": 4.942546966973318e-05,
2192
+ "loss": 1.6574,
2193
+ "step": 6850
2194
+ },
2195
+ {
2196
+ "epoch": 0.011497320369055592,
2197
+ "eval_loss": 1.8261781930923462,
2198
+ "eval_masked_accuracy": 0.6752136945724487,
2199
+ "eval_runtime": 1.7446,
2200
+ "eval_samples_per_second": 5.732,
2201
+ "eval_steps_per_second": 2.293,
2202
+ "step": 6850
2203
+ },
2204
+ {
2205
+ "epoch": 0.011581242415545048,
2206
+ "grad_norm": 5.9191155433654785,
2207
+ "learning_rate": 4.942127356740871e-05,
2208
+ "loss": 1.5732,
2209
+ "step": 6900
2210
+ },
2211
+ {
2212
+ "epoch": 0.011581242415545048,
2213
+ "eval_loss": 1.2290430068969727,
2214
+ "eval_masked_accuracy": 0.7573221921920776,
2215
+ "eval_runtime": 1.7438,
2216
+ "eval_samples_per_second": 5.735,
2217
+ "eval_steps_per_second": 2.294,
2218
+ "step": 6900
2219
+ },
2220
+ {
2221
+ "epoch": 0.011665164462034505,
2222
+ "grad_norm": 5.910600185394287,
2223
+ "learning_rate": 4.941707746508423e-05,
2224
+ "loss": 1.5018,
2225
+ "step": 6950
2226
+ },
2227
+ {
2228
+ "epoch": 0.011665164462034505,
2229
+ "eval_loss": 1.3011202812194824,
2230
+ "eval_masked_accuracy": 0.746835470199585,
2231
+ "eval_runtime": 1.739,
2232
+ "eval_samples_per_second": 5.751,
2233
+ "eval_steps_per_second": 2.3,
2234
+ "step": 6950
2235
+ },
2236
+ {
2237
+ "epoch": 0.011749086508523963,
2238
+ "grad_norm": 7.273187637329102,
2239
+ "learning_rate": 4.941288136275976e-05,
2240
+ "loss": 1.6083,
2241
+ "step": 7000
2242
+ },
2243
+ {
2244
+ "epoch": 0.011749086508523963,
2245
+ "eval_loss": 1.7945482730865479,
2246
+ "eval_masked_accuracy": 0.6719367504119873,
2247
+ "eval_runtime": 1.7495,
2248
+ "eval_samples_per_second": 5.716,
2249
+ "eval_steps_per_second": 2.286,
2250
+ "step": 7000
2251
+ },
2252
+ {
2253
+ "epoch": 0.01183300855501342,
2254
+ "grad_norm": 5.980038642883301,
2255
+ "learning_rate": 4.940868526043529e-05,
2256
+ "loss": 1.7157,
2257
+ "step": 7050
2258
+ },
2259
+ {
2260
+ "epoch": 0.01183300855501342,
2261
+ "eval_loss": 1.6633656024932861,
2262
+ "eval_masked_accuracy": 0.6859503984451294,
2263
+ "eval_runtime": 1.7603,
2264
+ "eval_samples_per_second": 5.681,
2265
+ "eval_steps_per_second": 2.272,
2266
+ "step": 7050
2267
+ },
2268
+ {
2269
+ "epoch": 0.011916930601502876,
2270
+ "grad_norm": 4.222002029418945,
2271
+ "learning_rate": 4.9404489158110817e-05,
2272
+ "loss": 1.4124,
2273
+ "step": 7100
2274
+ },
2275
+ {
2276
+ "epoch": 0.011916930601502876,
2277
+ "eval_loss": 1.7207615375518799,
2278
+ "eval_masked_accuracy": 0.6793248653411865,
2279
+ "eval_runtime": 1.753,
2280
+ "eval_samples_per_second": 5.704,
2281
+ "eval_steps_per_second": 2.282,
2282
+ "step": 7100
2283
+ },
2284
+ {
2285
+ "epoch": 0.012000852647992333,
2286
+ "grad_norm": 8.79937744140625,
2287
+ "learning_rate": 4.9400293055786345e-05,
2288
+ "loss": 1.5698,
2289
+ "step": 7150
2290
+ },
2291
+ {
2292
+ "epoch": 0.012000852647992333,
2293
+ "eval_loss": 1.5078874826431274,
2294
+ "eval_masked_accuracy": 0.7276119589805603,
2295
+ "eval_runtime": 1.866,
2296
+ "eval_samples_per_second": 5.359,
2297
+ "eval_steps_per_second": 2.144,
2298
+ "step": 7150
2299
+ },
2300
+ {
2301
+ "epoch": 0.01208477469448179,
2302
+ "grad_norm": 6.331279754638672,
2303
+ "learning_rate": 4.939609695346187e-05,
2304
+ "loss": 1.5354,
2305
+ "step": 7200
2306
+ },
2307
+ {
2308
+ "epoch": 0.01208477469448179,
2309
+ "eval_loss": 1.3983685970306396,
2310
+ "eval_masked_accuracy": 0.7590909004211426,
2311
+ "eval_runtime": 1.7632,
2312
+ "eval_samples_per_second": 5.672,
2313
+ "eval_steps_per_second": 2.269,
2314
+ "step": 7200
2315
+ },
2316
+ {
2317
+ "epoch": 0.012168696740971246,
2318
+ "grad_norm": 4.12935733795166,
2319
+ "learning_rate": 4.9391900851137395e-05,
2320
+ "loss": 1.4778,
2321
+ "step": 7250
2322
+ },
2323
+ {
2324
+ "epoch": 0.012168696740971246,
2325
+ "eval_loss": 1.7603422403335571,
2326
+ "eval_masked_accuracy": 0.686956524848938,
2327
+ "eval_runtime": 1.7504,
2328
+ "eval_samples_per_second": 5.713,
2329
+ "eval_steps_per_second": 2.285,
2330
+ "step": 7250
2331
+ },
2332
+ {
2333
+ "epoch": 0.012252618787460704,
2334
+ "grad_norm": 5.025778293609619,
2335
+ "learning_rate": 4.9387704748812923e-05,
2336
+ "loss": 1.5175,
2337
+ "step": 7300
2338
+ },
2339
+ {
2340
+ "epoch": 0.012252618787460704,
2341
+ "eval_loss": 1.7313247919082642,
2342
+ "eval_masked_accuracy": 0.6872428059577942,
2343
+ "eval_runtime": 1.745,
2344
+ "eval_samples_per_second": 5.731,
2345
+ "eval_steps_per_second": 2.292,
2346
+ "step": 7300
2347
+ },
2348
+ {
2349
+ "epoch": 0.01233654083395016,
2350
+ "grad_norm": 9.704473495483398,
2351
+ "learning_rate": 4.938350864648845e-05,
2352
+ "loss": 1.4634,
2353
+ "step": 7350
2354
+ },
2355
+ {
2356
+ "epoch": 0.01233654083395016,
2357
+ "eval_loss": 1.271333932876587,
2358
+ "eval_masked_accuracy": 0.7397260069847107,
2359
+ "eval_runtime": 1.7484,
2360
+ "eval_samples_per_second": 5.72,
2361
+ "eval_steps_per_second": 2.288,
2362
+ "step": 7350
2363
+ },
2364
+ {
2365
+ "epoch": 0.012420462880439617,
2366
+ "grad_norm": 6.080599308013916,
2367
+ "learning_rate": 4.937931254416398e-05,
2368
+ "loss": 1.5937,
2369
+ "step": 7400
2370
+ },
2371
+ {
2372
+ "epoch": 0.012420462880439617,
2373
+ "eval_loss": 1.4850938320159912,
2374
+ "eval_masked_accuracy": 0.7280701994895935,
2375
+ "eval_runtime": 1.7517,
2376
+ "eval_samples_per_second": 5.709,
2377
+ "eval_steps_per_second": 2.284,
2378
+ "step": 7400
2379
+ },
2380
+ {
2381
+ "epoch": 0.012504384926929074,
2382
+ "grad_norm": 3.824946880340576,
2383
+ "learning_rate": 4.937511644183951e-05,
2384
+ "loss": 1.6026,
2385
+ "step": 7450
2386
+ },
2387
+ {
2388
+ "epoch": 0.012504384926929074,
2389
+ "eval_loss": 1.5267841815948486,
2390
+ "eval_masked_accuracy": 0.7058823704719543,
2391
+ "eval_runtime": 1.7438,
2392
+ "eval_samples_per_second": 5.734,
2393
+ "eval_steps_per_second": 2.294,
2394
+ "step": 7450
2395
+ },
2396
+ {
2397
+ "epoch": 0.01258830697341853,
2398
+ "grad_norm": 4.5395989418029785,
2399
+ "learning_rate": 4.937092033951503e-05,
2400
+ "loss": 1.4575,
2401
+ "step": 7500
2402
+ },
2403
+ {
2404
+ "epoch": 0.01258830697341853,
2405
+ "eval_loss": 1.4801056385040283,
2406
+ "eval_masked_accuracy": 0.680672287940979,
2407
+ "eval_runtime": 1.7409,
2408
+ "eval_samples_per_second": 5.744,
2409
+ "eval_steps_per_second": 2.298,
2410
+ "step": 7500
2411
+ },
2412
+ {
2413
+ "epoch": 0.012672229019907989,
2414
+ "grad_norm": 6.853204250335693,
2415
+ "learning_rate": 4.936672423719056e-05,
2416
+ "loss": 1.4224,
2417
+ "step": 7550
2418
+ },
2419
+ {
2420
+ "epoch": 0.012672229019907989,
2421
+ "eval_loss": 1.6892282962799072,
2422
+ "eval_masked_accuracy": 0.6551724076271057,
2423
+ "eval_runtime": 1.7414,
2424
+ "eval_samples_per_second": 5.742,
2425
+ "eval_steps_per_second": 2.297,
2426
+ "step": 7550
2427
+ },
2428
+ {
2429
+ "epoch": 0.012756151066397445,
2430
+ "grad_norm": 5.53077507019043,
2431
+ "learning_rate": 4.936252813486609e-05,
2432
+ "loss": 1.6706,
2433
+ "step": 7600
2434
+ },
2435
+ {
2436
+ "epoch": 0.012756151066397445,
2437
+ "eval_loss": 1.4235472679138184,
2438
+ "eval_masked_accuracy": 0.7426160573959351,
2439
+ "eval_runtime": 1.8082,
2440
+ "eval_samples_per_second": 5.53,
2441
+ "eval_steps_per_second": 2.212,
2442
+ "step": 7600
2443
+ },
2444
+ {
2445
+ "epoch": 0.012840073112886902,
2446
+ "grad_norm": 4.5907087326049805,
2447
+ "learning_rate": 4.9358332032541616e-05,
2448
+ "loss": 1.6674,
2449
+ "step": 7650
2450
+ },
2451
+ {
2452
+ "epoch": 0.012840073112886902,
2453
+ "eval_loss": 1.4942524433135986,
2454
+ "eval_masked_accuracy": 0.7172995805740356,
2455
+ "eval_runtime": 1.7449,
2456
+ "eval_samples_per_second": 5.731,
2457
+ "eval_steps_per_second": 2.292,
2458
+ "step": 7650
2459
+ },
2460
+ {
2461
+ "epoch": 0.012923995159376358,
2462
+ "grad_norm": 8.004353523254395,
2463
+ "learning_rate": 4.9354135930217144e-05,
2464
+ "loss": 1.4294,
2465
+ "step": 7700
2466
+ },
2467
+ {
2468
+ "epoch": 0.012923995159376358,
2469
+ "eval_loss": 1.7548024654388428,
2470
+ "eval_masked_accuracy": 0.6547619104385376,
2471
+ "eval_runtime": 1.7767,
2472
+ "eval_samples_per_second": 5.628,
2473
+ "eval_steps_per_second": 2.251,
2474
+ "step": 7700
2475
+ },
2476
+ {
2477
+ "epoch": 0.013007917205865815,
2478
+ "grad_norm": 6.963031768798828,
2479
+ "learning_rate": 4.934993982789267e-05,
2480
+ "loss": 1.5078,
2481
+ "step": 7750
2482
+ },
2483
+ {
2484
+ "epoch": 0.013007917205865815,
2485
+ "eval_loss": 1.4269187450408936,
2486
+ "eval_masked_accuracy": 0.7027027010917664,
2487
+ "eval_runtime": 1.7471,
2488
+ "eval_samples_per_second": 5.724,
2489
+ "eval_steps_per_second": 2.29,
2490
+ "step": 7750
2491
+ },
2492
+ {
2493
+ "epoch": 0.013091839252355273,
2494
+ "grad_norm": 6.4043288230896,
2495
+ "learning_rate": 4.9345743725568194e-05,
2496
+ "loss": 1.604,
2497
+ "step": 7800
2498
+ },
2499
+ {
2500
+ "epoch": 0.013091839252355273,
2501
+ "eval_loss": 1.4502145051956177,
2502
+ "eval_masked_accuracy": 0.7172995805740356,
2503
+ "eval_runtime": 1.748,
2504
+ "eval_samples_per_second": 5.721,
2505
+ "eval_steps_per_second": 2.288,
2506
+ "step": 7800
2507
+ },
2508
+ {
2509
+ "epoch": 0.01317576129884473,
2510
+ "grad_norm": 5.293691158294678,
2511
+ "learning_rate": 4.934154762324372e-05,
2512
+ "loss": 1.6301,
2513
+ "step": 7850
2514
+ },
2515
+ {
2516
+ "epoch": 0.01317576129884473,
2517
+ "eval_loss": 1.3547624349594116,
2518
+ "eval_masked_accuracy": 0.7759336233139038,
2519
+ "eval_runtime": 1.7437,
2520
+ "eval_samples_per_second": 5.735,
2521
+ "eval_steps_per_second": 2.294,
2522
+ "step": 7850
2523
+ },
2524
+ {
2525
+ "epoch": 0.013259683345334186,
2526
+ "grad_norm": 7.364100933074951,
2527
+ "learning_rate": 4.933735152091925e-05,
2528
+ "loss": 1.5163,
2529
+ "step": 7900
2530
+ },
2531
+ {
2532
+ "epoch": 0.013259683345334186,
2533
+ "eval_loss": 1.6089417934417725,
2534
+ "eval_masked_accuracy": 0.6610878705978394,
2535
+ "eval_runtime": 1.753,
2536
+ "eval_samples_per_second": 5.704,
2537
+ "eval_steps_per_second": 2.282,
2538
+ "step": 7900
2539
+ },
2540
+ {
2541
+ "epoch": 0.013343605391823643,
2542
+ "grad_norm": 7.704033851623535,
2543
+ "learning_rate": 4.933315541859478e-05,
2544
+ "loss": 1.6564,
2545
+ "step": 7950
2546
+ },
2547
+ {
2548
+ "epoch": 0.013343605391823643,
2549
+ "eval_loss": 1.4759953022003174,
2550
+ "eval_masked_accuracy": 0.6958333253860474,
2551
+ "eval_runtime": 1.7614,
2552
+ "eval_samples_per_second": 5.677,
2553
+ "eval_steps_per_second": 2.271,
2554
+ "step": 7950
2555
+ },
2556
+ {
2557
+ "epoch": 0.0134275274383131,
2558
+ "grad_norm": 5.562460899353027,
2559
+ "learning_rate": 4.932895931627031e-05,
2560
+ "loss": 1.5703,
2561
+ "step": 8000
2562
+ },
2563
+ {
2564
+ "epoch": 0.0134275274383131,
2565
+ "eval_loss": 1.735896348953247,
2566
+ "eval_masked_accuracy": 0.6875,
2567
+ "eval_runtime": 1.7493,
2568
+ "eval_samples_per_second": 5.717,
2569
+ "eval_steps_per_second": 2.287,
2570
+ "step": 8000
2571
+ },
2572
+ {
2573
+ "epoch": 0.013511449484802556,
2574
+ "grad_norm": 8.801225662231445,
2575
+ "learning_rate": 4.9324763213945836e-05,
2576
+ "loss": 1.5328,
2577
+ "step": 8050
2578
+ },
2579
+ {
2580
+ "epoch": 0.013511449484802556,
2581
+ "eval_loss": 1.2792503833770752,
2582
+ "eval_masked_accuracy": 0.7292576432228088,
2583
+ "eval_runtime": 1.7802,
2584
+ "eval_samples_per_second": 5.617,
2585
+ "eval_steps_per_second": 2.247,
2586
+ "step": 8050
2587
+ },
2588
+ {
2589
+ "epoch": 0.013595371531292014,
2590
+ "grad_norm": 5.510076999664307,
2591
+ "learning_rate": 4.932056711162136e-05,
2592
+ "loss": 1.5086,
2593
+ "step": 8100
2594
+ },
2595
+ {
2596
+ "epoch": 0.013595371531292014,
2597
+ "eval_loss": 1.811342477798462,
2598
+ "eval_masked_accuracy": 0.6508620977401733,
2599
+ "eval_runtime": 1.7772,
2600
+ "eval_samples_per_second": 5.627,
2601
+ "eval_steps_per_second": 2.251,
2602
+ "step": 8100
2603
+ },
2604
+ {
2605
+ "epoch": 0.01367929357778147,
2606
+ "grad_norm": 4.370019912719727,
2607
+ "learning_rate": 4.9316371009296886e-05,
2608
+ "loss": 1.5992,
2609
+ "step": 8150
2610
+ },
2611
+ {
2612
+ "epoch": 0.01367929357778147,
2613
+ "eval_loss": 1.7015224695205688,
2614
+ "eval_masked_accuracy": 0.6945606470108032,
2615
+ "eval_runtime": 1.7399,
2616
+ "eval_samples_per_second": 5.747,
2617
+ "eval_steps_per_second": 2.299,
2618
+ "step": 8150
2619
+ },
2620
+ {
2621
+ "epoch": 0.013763215624270927,
2622
+ "grad_norm": 5.960280895233154,
2623
+ "learning_rate": 4.9312174906972415e-05,
2624
+ "loss": 1.6392,
2625
+ "step": 8200
2626
+ },
2627
+ {
2628
+ "epoch": 0.013763215624270927,
2629
+ "eval_loss": 1.4644631147384644,
2630
+ "eval_masked_accuracy": 0.7004830837249756,
2631
+ "eval_runtime": 1.7493,
2632
+ "eval_samples_per_second": 5.717,
2633
+ "eval_steps_per_second": 2.287,
2634
+ "step": 8200
2635
+ },
2636
+ {
2637
+ "epoch": 0.013847137670760384,
2638
+ "grad_norm": 5.401033878326416,
2639
+ "learning_rate": 4.930797880464794e-05,
2640
+ "loss": 1.6492,
2641
+ "step": 8250
2642
+ },
2643
+ {
2644
+ "epoch": 0.013847137670760384,
2645
+ "eval_loss": 1.5244245529174805,
2646
+ "eval_masked_accuracy": 0.688034176826477,
2647
+ "eval_runtime": 1.7597,
2648
+ "eval_samples_per_second": 5.683,
2649
+ "eval_steps_per_second": 2.273,
2650
+ "step": 8250
2651
+ },
2652
+ {
2653
+ "epoch": 0.01393105971724984,
2654
+ "grad_norm": 7.356916427612305,
2655
+ "learning_rate": 4.930378270232347e-05,
2656
+ "loss": 1.5673,
2657
+ "step": 8300
2658
+ },
2659
+ {
2660
+ "epoch": 0.01393105971724984,
2661
+ "eval_loss": 1.4024368524551392,
2662
+ "eval_masked_accuracy": 0.7016806602478027,
2663
+ "eval_runtime": 1.7463,
2664
+ "eval_samples_per_second": 5.726,
2665
+ "eval_steps_per_second": 2.291,
2666
+ "step": 8300
2667
+ },
2668
+ {
2669
+ "epoch": 0.014014981763739299,
2670
+ "grad_norm": 5.370472431182861,
2671
+ "learning_rate": 4.929958659999899e-05,
2672
+ "loss": 1.5267,
2673
+ "step": 8350
2674
+ },
2675
+ {
2676
+ "epoch": 0.014014981763739299,
2677
+ "eval_loss": 1.7430174350738525,
2678
+ "eval_masked_accuracy": 0.6653061509132385,
2679
+ "eval_runtime": 1.7353,
2680
+ "eval_samples_per_second": 5.763,
2681
+ "eval_steps_per_second": 2.305,
2682
+ "step": 8350
2683
+ },
2684
+ {
2685
+ "epoch": 0.014098903810228755,
2686
+ "grad_norm": 6.4656500816345215,
2687
+ "learning_rate": 4.929539049767452e-05,
2688
+ "loss": 1.5918,
2689
+ "step": 8400
2690
+ },
2691
+ {
2692
+ "epoch": 0.014098903810228755,
2693
+ "eval_loss": 1.691054344177246,
2694
+ "eval_masked_accuracy": 0.6849315166473389,
2695
+ "eval_runtime": 1.746,
2696
+ "eval_samples_per_second": 5.727,
2697
+ "eval_steps_per_second": 2.291,
2698
+ "step": 8400
2699
+ },
2700
+ {
2701
+ "epoch": 0.014182825856718212,
2702
+ "grad_norm": 5.481358051300049,
2703
+ "learning_rate": 4.929119439535005e-05,
2704
+ "loss": 1.5156,
2705
+ "step": 8450
2706
+ },
2707
+ {
2708
+ "epoch": 0.014182825856718212,
2709
+ "eval_loss": 1.6469824314117432,
2710
+ "eval_masked_accuracy": 0.6516393423080444,
2711
+ "eval_runtime": 1.7423,
2712
+ "eval_samples_per_second": 5.74,
2713
+ "eval_steps_per_second": 2.296,
2714
+ "step": 8450
2715
+ },
2716
+ {
2717
+ "epoch": 0.014266747903207668,
2718
+ "grad_norm": 4.755044937133789,
2719
+ "learning_rate": 4.928699829302558e-05,
2720
+ "loss": 1.5223,
2721
+ "step": 8500
2722
+ },
2723
+ {
2724
+ "epoch": 0.014266747903207668,
2725
+ "eval_loss": 1.667824387550354,
2726
+ "eval_masked_accuracy": 0.7068965435028076,
2727
+ "eval_runtime": 1.7979,
2728
+ "eval_samples_per_second": 5.562,
2729
+ "eval_steps_per_second": 2.225,
2730
+ "step": 8500
2731
+ },
2732
+ {
2733
+ "epoch": 0.014350669949697125,
2734
+ "grad_norm": 6.595943450927734,
2735
+ "learning_rate": 4.928280219070111e-05,
2736
+ "loss": 1.4699,
2737
+ "step": 8550
2738
+ },
2739
+ {
2740
+ "epoch": 0.014350669949697125,
2741
+ "eval_loss": 1.2367641925811768,
2742
+ "eval_masked_accuracy": 0.7447698712348938,
2743
+ "eval_runtime": 1.7387,
2744
+ "eval_samples_per_second": 5.752,
2745
+ "eval_steps_per_second": 2.301,
2746
+ "step": 8550
2747
+ },
2748
+ {
2749
+ "epoch": 0.014434591996186583,
2750
+ "grad_norm": 3.9210710525512695,
2751
+ "learning_rate": 4.9278606088376635e-05,
2752
+ "loss": 1.5695,
2753
+ "step": 8600
2754
+ },
2755
+ {
2756
+ "epoch": 0.014434591996186583,
2757
+ "eval_loss": 1.3033006191253662,
2758
+ "eval_masked_accuracy": 0.693965494632721,
2759
+ "eval_runtime": 1.7554,
2760
+ "eval_samples_per_second": 5.697,
2761
+ "eval_steps_per_second": 2.279,
2762
+ "step": 8600
2763
+ },
2764
+ {
2765
+ "epoch": 0.01451851404267604,
2766
+ "grad_norm": 4.682461261749268,
2767
+ "learning_rate": 4.927440998605216e-05,
2768
+ "loss": 1.5371,
2769
+ "step": 8650
2770
+ },
2771
+ {
2772
+ "epoch": 0.01451851404267604,
2773
+ "eval_loss": 1.727216124534607,
2774
+ "eval_masked_accuracy": 0.6639004349708557,
2775
+ "eval_runtime": 1.7387,
2776
+ "eval_samples_per_second": 5.751,
2777
+ "eval_steps_per_second": 2.301,
2778
+ "step": 8650
2779
+ },
2780
+ {
2781
+ "epoch": 0.014602436089165496,
2782
+ "grad_norm": 4.478100776672363,
2783
+ "learning_rate": 4.9270213883727685e-05,
2784
+ "loss": 1.5679,
2785
+ "step": 8700
2786
+ },
2787
+ {
2788
+ "epoch": 0.014602436089165496,
2789
+ "eval_loss": 1.4694969654083252,
2790
+ "eval_masked_accuracy": 0.7364016771316528,
2791
+ "eval_runtime": 1.7474,
2792
+ "eval_samples_per_second": 5.723,
2793
+ "eval_steps_per_second": 2.289,
2794
+ "step": 8700
2795
+ },
2796
+ {
2797
+ "epoch": 0.014686358135654953,
2798
+ "grad_norm": 8.149710655212402,
2799
+ "learning_rate": 4.9266017781403214e-05,
2800
+ "loss": 1.4814,
2801
+ "step": 8750
2802
+ },
2803
+ {
2804
+ "epoch": 0.014686358135654953,
2805
+ "eval_loss": 1.9258610010147095,
2806
+ "eval_masked_accuracy": 0.6228070259094238,
2807
+ "eval_runtime": 1.7513,
2808
+ "eval_samples_per_second": 5.71,
2809
+ "eval_steps_per_second": 2.284,
2810
+ "step": 8750
2811
+ },
2812
+ {
2813
+ "epoch": 0.01477028018214441,
2814
+ "grad_norm": 4.727016925811768,
2815
+ "learning_rate": 4.926182167907874e-05,
2816
+ "loss": 1.609,
2817
+ "step": 8800
2818
+ },
2819
+ {
2820
+ "epoch": 0.01477028018214441,
2821
+ "eval_loss": 1.6111774444580078,
2822
+ "eval_masked_accuracy": 0.6590038537979126,
2823
+ "eval_runtime": 1.7579,
2824
+ "eval_samples_per_second": 5.689,
2825
+ "eval_steps_per_second": 2.275,
2826
+ "step": 8800
2827
+ },
2828
+ {
2829
+ "epoch": 0.014854202228633867,
2830
+ "grad_norm": 5.348945140838623,
2831
+ "learning_rate": 4.925762557675427e-05,
2832
+ "loss": 1.5557,
2833
+ "step": 8850
2834
+ },
2835
+ {
2836
+ "epoch": 0.014854202228633867,
2837
+ "eval_loss": 1.3535053730010986,
2838
+ "eval_masked_accuracy": 0.7245762944221497,
2839
+ "eval_runtime": 1.8639,
2840
+ "eval_samples_per_second": 5.365,
2841
+ "eval_steps_per_second": 2.146,
2842
+ "step": 8850
2843
+ },
2844
+ {
2845
+ "epoch": 0.014938124275123324,
2846
+ "grad_norm": 6.573589324951172,
2847
+ "learning_rate": 4.92534294744298e-05,
2848
+ "loss": 1.6389,
2849
+ "step": 8900
2850
+ },
2851
+ {
2852
+ "epoch": 0.014938124275123324,
2853
+ "eval_loss": 1.8509418964385986,
2854
+ "eval_masked_accuracy": 0.7085201740264893,
2855
+ "eval_runtime": 1.7536,
2856
+ "eval_samples_per_second": 5.703,
2857
+ "eval_steps_per_second": 2.281,
2858
+ "step": 8900
2859
+ },
2860
+ {
2861
+ "epoch": 0.01502204632161278,
2862
+ "grad_norm": 7.373574256896973,
2863
+ "learning_rate": 4.924923337210532e-05,
2864
+ "loss": 1.4773,
2865
+ "step": 8950
2866
+ },
2867
+ {
2868
+ "epoch": 0.01502204632161278,
2869
+ "eval_loss": 1.7772554159164429,
2870
+ "eval_masked_accuracy": 0.6640625,
2871
+ "eval_runtime": 1.7655,
2872
+ "eval_samples_per_second": 5.664,
2873
+ "eval_steps_per_second": 2.266,
2874
+ "step": 8950
2875
+ },
2876
+ {
2877
+ "epoch": 0.015105968368102237,
2878
+ "grad_norm": 5.861003875732422,
2879
+ "learning_rate": 4.924503726978085e-05,
2880
+ "loss": 1.3842,
2881
+ "step": 9000
2882
+ },
2883
+ {
2884
+ "epoch": 0.015105968368102237,
2885
+ "eval_loss": 1.6182334423065186,
2886
+ "eval_masked_accuracy": 0.7183098793029785,
2887
+ "eval_runtime": 1.7386,
2888
+ "eval_samples_per_second": 5.752,
2889
+ "eval_steps_per_second": 2.301,
2890
+ "step": 9000
2891
+ },
2892
+ {
2893
+ "epoch": 0.015189890414591694,
2894
+ "grad_norm": 5.086306571960449,
2895
+ "learning_rate": 4.924084116745638e-05,
2896
+ "loss": 1.6445,
2897
+ "step": 9050
2898
+ },
2899
+ {
2900
+ "epoch": 0.015189890414591694,
2901
+ "eval_loss": 1.3457679748535156,
2902
+ "eval_masked_accuracy": 0.752293586730957,
2903
+ "eval_runtime": 1.7595,
2904
+ "eval_samples_per_second": 5.684,
2905
+ "eval_steps_per_second": 2.273,
2906
+ "step": 9050
2907
+ },
2908
+ {
2909
+ "epoch": 0.01527381246108115,
2910
+ "grad_norm": 7.099021911621094,
2911
+ "learning_rate": 4.9236645065131906e-05,
2912
+ "loss": 1.5536,
2913
+ "step": 9100
2914
+ },
2915
+ {
2916
+ "epoch": 0.01527381246108115,
2917
+ "eval_loss": 1.8317623138427734,
2918
+ "eval_masked_accuracy": 0.6588628888130188,
2919
+ "eval_runtime": 1.8424,
2920
+ "eval_samples_per_second": 5.428,
2921
+ "eval_steps_per_second": 2.171,
2922
+ "step": 9100
2923
+ },
2924
+ {
2925
+ "epoch": 0.015357734507570608,
2926
+ "grad_norm": 6.620283126831055,
2927
+ "learning_rate": 4.9232448962807434e-05,
2928
+ "loss": 1.5151,
2929
+ "step": 9150
2930
+ },
2931
+ {
2932
+ "epoch": 0.015357734507570608,
2933
+ "eval_loss": 1.4230843782424927,
2934
+ "eval_masked_accuracy": 0.700421929359436,
2935
+ "eval_runtime": 1.7611,
2936
+ "eval_samples_per_second": 5.678,
2937
+ "eval_steps_per_second": 2.271,
2938
+ "step": 9150
2939
+ },
2940
+ {
2941
+ "epoch": 0.015441656554060065,
2942
+ "grad_norm": 7.231357097625732,
2943
+ "learning_rate": 4.922825286048296e-05,
2944
+ "loss": 1.6078,
2945
+ "step": 9200
2946
+ },
2947
+ {
2948
+ "epoch": 0.015441656554060065,
2949
+ "eval_loss": 1.7547998428344727,
2950
+ "eval_masked_accuracy": 0.6745283007621765,
2951
+ "eval_runtime": 1.8328,
2952
+ "eval_samples_per_second": 5.456,
2953
+ "eval_steps_per_second": 2.182,
2954
+ "step": 9200
2955
+ },
2956
+ {
2957
+ "epoch": 0.015525578600549522,
2958
+ "grad_norm": 4.755532264709473,
2959
+ "learning_rate": 4.9224140680204975e-05,
2960
+ "loss": 1.5938,
2961
+ "step": 9250
2962
+ },
2963
+ {
2964
+ "epoch": 0.015525578600549522,
2965
+ "eval_loss": 1.3346257209777832,
2966
+ "eval_masked_accuracy": 0.7244444489479065,
2967
+ "eval_runtime": 1.7553,
2968
+ "eval_samples_per_second": 5.697,
2969
+ "eval_steps_per_second": 2.279,
2970
+ "step": 9250
2971
+ },
2972
+ {
2973
+ "epoch": 0.015609500647038978,
2974
+ "grad_norm": 5.728196620941162,
2975
+ "learning_rate": 4.92199445778805e-05,
2976
+ "loss": 1.5542,
2977
+ "step": 9300
2978
+ },
2979
+ {
2980
+ "epoch": 0.015609500647038978,
2981
+ "eval_loss": 1.6833394765853882,
2982
+ "eval_masked_accuracy": 0.6654929518699646,
2983
+ "eval_runtime": 1.7516,
2984
+ "eval_samples_per_second": 5.709,
2985
+ "eval_steps_per_second": 2.284,
2986
+ "step": 9300
2987
+ },
2988
+ {
2989
+ "epoch": 0.015693422693528435,
2990
+ "grad_norm": 5.66224479675293,
2991
+ "learning_rate": 4.921574847555603e-05,
2992
+ "loss": 1.6099,
2993
+ "step": 9350
2994
+ },
2995
+ {
2996
+ "epoch": 0.015693422693528435,
2997
+ "eval_loss": 1.442452311515808,
2998
+ "eval_masked_accuracy": 0.6905829310417175,
2999
+ "eval_runtime": 1.7553,
3000
+ "eval_samples_per_second": 5.697,
3001
+ "eval_steps_per_second": 2.279,
3002
+ "step": 9350
3003
+ },
3004
+ {
3005
+ "epoch": 0.015777344740017893,
3006
+ "grad_norm": 6.560795307159424,
3007
+ "learning_rate": 4.921155237323155e-05,
3008
+ "loss": 1.4188,
3009
+ "step": 9400
3010
+ },
3011
+ {
3012
+ "epoch": 0.015777344740017893,
3013
+ "eval_loss": 1.539738416671753,
3014
+ "eval_masked_accuracy": 0.68359375,
3015
+ "eval_runtime": 1.7406,
3016
+ "eval_samples_per_second": 5.745,
3017
+ "eval_steps_per_second": 2.298,
3018
+ "step": 9400
3019
+ },
3020
+ {
3021
+ "epoch": 0.015861266786507348,
3022
+ "grad_norm": 4.9847025871276855,
3023
+ "learning_rate": 4.920735627090708e-05,
3024
+ "loss": 1.6344,
3025
+ "step": 9450
3026
+ },
3027
+ {
3028
+ "epoch": 0.015861266786507348,
3029
+ "eval_loss": 1.244769811630249,
3030
+ "eval_masked_accuracy": 0.7078189253807068,
3031
+ "eval_runtime": 1.77,
3032
+ "eval_samples_per_second": 5.65,
3033
+ "eval_steps_per_second": 2.26,
3034
+ "step": 9450
3035
+ },
3036
+ {
3037
+ "epoch": 0.015945188832996806,
3038
+ "grad_norm": 6.173788070678711,
3039
+ "learning_rate": 4.920316016858261e-05,
3040
+ "loss": 1.6249,
3041
+ "step": 9500
3042
+ },
3043
+ {
3044
+ "epoch": 0.015945188832996806,
3045
+ "eval_loss": 2.0483577251434326,
3046
+ "eval_masked_accuracy": 0.607594907283783,
3047
+ "eval_runtime": 1.7538,
3048
+ "eval_samples_per_second": 5.702,
3049
+ "eval_steps_per_second": 2.281,
3050
+ "step": 9500
3051
+ },
3052
+ {
3053
+ "epoch": 0.016029110879486264,
3054
+ "grad_norm": 4.4076828956604,
3055
+ "learning_rate": 4.919896406625814e-05,
3056
+ "loss": 1.505,
3057
+ "step": 9550
3058
+ },
3059
+ {
3060
+ "epoch": 0.016029110879486264,
3061
+ "eval_loss": 1.7403160333633423,
3062
+ "eval_masked_accuracy": 0.7048457860946655,
3063
+ "eval_runtime": 1.7491,
3064
+ "eval_samples_per_second": 5.717,
3065
+ "eval_steps_per_second": 2.287,
3066
+ "step": 9550
3067
+ },
3068
+ {
3069
+ "epoch": 0.01611303292597572,
3070
+ "grad_norm": 6.358312129974365,
3071
+ "learning_rate": 4.919476796393366e-05,
3072
+ "loss": 1.655,
3073
+ "step": 9600
3074
+ },
3075
+ {
3076
+ "epoch": 0.01611303292597572,
3077
+ "eval_loss": 1.8444688320159912,
3078
+ "eval_masked_accuracy": 0.6808510422706604,
3079
+ "eval_runtime": 1.7573,
3080
+ "eval_samples_per_second": 5.691,
3081
+ "eval_steps_per_second": 2.276,
3082
+ "step": 9600
3083
+ },
3084
+ {
3085
+ "epoch": 0.016196954972465177,
3086
+ "grad_norm": 6.645698547363281,
3087
+ "learning_rate": 4.919057186160919e-05,
3088
+ "loss": 1.5926,
3089
+ "step": 9650
3090
+ },
3091
+ {
3092
+ "epoch": 0.016196954972465177,
3093
+ "eval_loss": 1.6228317022323608,
3094
+ "eval_masked_accuracy": 0.65625,
3095
+ "eval_runtime": 1.8422,
3096
+ "eval_samples_per_second": 5.428,
3097
+ "eval_steps_per_second": 2.171,
3098
+ "step": 9650
3099
+ },
3100
+ {
3101
+ "epoch": 0.016280877018954632,
3102
+ "grad_norm": 5.672697067260742,
3103
+ "learning_rate": 4.918637575928472e-05,
3104
+ "loss": 1.4762,
3105
+ "step": 9700
3106
+ },
3107
+ {
3108
+ "epoch": 0.016280877018954632,
3109
+ "eval_loss": 1.5051512718200684,
3110
+ "eval_masked_accuracy": 0.6943231225013733,
3111
+ "eval_runtime": 1.7515,
3112
+ "eval_samples_per_second": 5.709,
3113
+ "eval_steps_per_second": 2.284,
3114
+ "step": 9700
3115
+ },
3116
+ {
3117
+ "epoch": 0.01636479906544409,
3118
+ "grad_norm": 5.369190216064453,
3119
+ "learning_rate": 4.9182179656960245e-05,
3120
+ "loss": 1.5021,
3121
+ "step": 9750
3122
+ },
3123
+ {
3124
+ "epoch": 0.01636479906544409,
3125
+ "eval_loss": 1.7301708459854126,
3126
+ "eval_masked_accuracy": 0.6593886613845825,
3127
+ "eval_runtime": 1.7374,
3128
+ "eval_samples_per_second": 5.756,
3129
+ "eval_steps_per_second": 2.302,
3130
+ "step": 9750
3131
+ },
3132
+ {
3133
+ "epoch": 0.01644872111193355,
3134
+ "grad_norm": 4.986740589141846,
3135
+ "learning_rate": 4.917798355463577e-05,
3136
+ "loss": 1.5618,
3137
+ "step": 9800
3138
+ },
3139
+ {
3140
+ "epoch": 0.01644872111193355,
3141
+ "eval_loss": 1.3315510749816895,
3142
+ "eval_masked_accuracy": 0.700421929359436,
3143
+ "eval_runtime": 1.7373,
3144
+ "eval_samples_per_second": 5.756,
3145
+ "eval_steps_per_second": 2.302,
3146
+ "step": 9800
3147
+ },
3148
+ {
3149
+ "epoch": 0.016532643158423004,
3150
+ "grad_norm": 7.441061973571777,
3151
+ "learning_rate": 4.9173787452311295e-05,
3152
+ "loss": 1.5428,
3153
+ "step": 9850
3154
+ },
3155
+ {
3156
+ "epoch": 0.016532643158423004,
3157
+ "eval_loss": 1.6381117105484009,
3158
+ "eval_masked_accuracy": 0.6695652008056641,
3159
+ "eval_runtime": 1.7386,
3160
+ "eval_samples_per_second": 5.752,
3161
+ "eval_steps_per_second": 2.301,
3162
+ "step": 9850
3163
+ },
3164
+ {
3165
+ "epoch": 0.016616565204912462,
3166
+ "grad_norm": 6.459640979766846,
3167
+ "learning_rate": 4.9169591349986824e-05,
3168
+ "loss": 1.4702,
3169
+ "step": 9900
3170
+ },
3171
+ {
3172
+ "epoch": 0.016616565204912462,
3173
+ "eval_loss": 1.537841796875,
3174
+ "eval_masked_accuracy": 0.6741573214530945,
3175
+ "eval_runtime": 1.7482,
3176
+ "eval_samples_per_second": 5.72,
3177
+ "eval_steps_per_second": 2.288,
3178
+ "step": 9900
3179
+ },
3180
+ {
3181
+ "epoch": 0.016700487251401917,
3182
+ "grad_norm": 6.058482646942139,
3183
+ "learning_rate": 4.916539524766235e-05,
3184
+ "loss": 1.5765,
3185
+ "step": 9950
3186
+ },
3187
+ {
3188
+ "epoch": 0.016700487251401917,
3189
+ "eval_loss": 1.688913345336914,
3190
+ "eval_masked_accuracy": 0.692307710647583,
3191
+ "eval_runtime": 1.7482,
3192
+ "eval_samples_per_second": 5.72,
3193
+ "eval_steps_per_second": 2.288,
3194
+ "step": 9950
3195
+ },
3196
+ {
3197
+ "epoch": 0.016784409297891375,
3198
+ "grad_norm": 4.960835933685303,
3199
+ "learning_rate": 4.916119914533788e-05,
3200
+ "loss": 1.544,
3201
+ "step": 10000
3202
+ },
3203
+ {
3204
+ "epoch": 0.016784409297891375,
3205
+ "eval_loss": 1.7901655435562134,
3206
+ "eval_masked_accuracy": 0.6443514823913574,
3207
+ "eval_runtime": 1.7882,
3208
+ "eval_samples_per_second": 5.592,
3209
+ "eval_steps_per_second": 2.237,
3210
+ "step": 10000
3211
  }
3212
  ],
3213
  "logging_steps": 50,
 
3227
  "attributes": {}
3228
  }
3229
  },
3230
+ "total_flos": 1.617791736784392e+16,
3231
  "train_batch_size": 2,
3232
  "trial_name": null,
3233
  "trial_params": null