ljcamargo commited on
Commit
8207bec
·
verified ·
1 Parent(s): 15f5943

Training in progress, step 2700, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c8c03bddde1d45b42f156ee9380731c978903eeb514446180e27d130995337d
3
  size 3237829088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec39b6059f46dd5c028a2b3a8df89e54652f47a7b2d1b473858cb9613ea2bf35
3
  size 3237829088
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a2d8576045f18ebc0d44a01c8bb87c6bcc68dc93cada4a8ed13a6805a28e50a
3
  size 2062272049
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38c900497bf3dbda1d6e7c1b32cf2a719ed87675df67744d76a636bb793c4be6
3
  size 2062272049
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cdd2d9b8a329c8bdf157e8302d4758961f9a282c3f0127e29e492f0c374d2cc5
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f936c4340b1a5e33087b6159d8f0cde321033f9a21edc5ffdda56dd518d57d1d
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a82daf79aef8e8b5ecd74ff5d2377b7a09a1c4d4504ecc0c2a12006214be596b
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29254c2526b30c1f020401ec71783f99885e5c23773b0ea29681c66ec8089ebb
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f95bb9c4b14269e2ef89bd678ab3c3d4b5f143d243a24d6ece8108f7e85154f8
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd91190946d7dc5a14f47d6b938cddd6477162a42282961cbb0f0f14b153eef3
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.6887645286267757,
6
  "eval_steps": 300,
7
- "global_step": 2400,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1688,6 +1688,216 @@
1688
  "learning_rate": 4.60977866504668e-05,
1689
  "loss": 0.7796,
1690
  "step": 2400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1691
  }
1692
  ],
1693
  "logging_steps": 10,
@@ -1707,7 +1917,7 @@
1707
  "attributes": {}
1708
  }
1709
  },
1710
- "total_flos": 9.8205186392064e+19,
1711
  "train_batch_size": 6,
1712
  "trial_name": null,
1713
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.7748600947051227,
6
  "eval_steps": 300,
7
+ "global_step": 2700,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1688
  "learning_rate": 4.60977866504668e-05,
1689
  "loss": 0.7796,
1690
  "step": 2400
1691
+ },
1692
+ {
1693
+ "epoch": 0.6916343808293873,
1694
+ "grad_norm": 5.168239593505859,
1695
+ "learning_rate": 4.5325446759871316e-05,
1696
+ "loss": 0.7764,
1697
+ "step": 2410
1698
+ },
1699
+ {
1700
+ "epoch": 0.6945042330319988,
1701
+ "grad_norm": 3.202075958251953,
1702
+ "learning_rate": 4.455773117142965e-05,
1703
+ "loss": 0.7483,
1704
+ "step": 2420
1705
+ },
1706
+ {
1707
+ "epoch": 0.6973740852346104,
1708
+ "grad_norm": 4.126010417938232,
1709
+ "learning_rate": 4.379470481752139e-05,
1710
+ "loss": 0.7702,
1711
+ "step": 2430
1712
+ },
1713
+ {
1714
+ "epoch": 0.700243937437222,
1715
+ "grad_norm": 5.2914509773254395,
1716
+ "learning_rate": 4.303643223391698e-05,
1717
+ "loss": 0.7663,
1718
+ "step": 2440
1719
+ },
1720
+ {
1721
+ "epoch": 0.7031137896398335,
1722
+ "grad_norm": 5.010975360870361,
1723
+ "learning_rate": 4.2282977554319034e-05,
1724
+ "loss": 0.7911,
1725
+ "step": 2450
1726
+ },
1727
+ {
1728
+ "epoch": 0.7059836418424451,
1729
+ "grad_norm": 3.504735231399536,
1730
+ "learning_rate": 4.153440450493823e-05,
1731
+ "loss": 0.7452,
1732
+ "step": 2460
1733
+ },
1734
+ {
1735
+ "epoch": 0.7088534940450567,
1736
+ "grad_norm": 5.5859880447387695,
1737
+ "learning_rate": 4.0790776399103294e-05,
1738
+ "loss": 0.758,
1739
+ "step": 2470
1740
+ },
1741
+ {
1742
+ "epoch": 0.7117233462476682,
1743
+ "grad_norm": 6.027501583099365,
1744
+ "learning_rate": 4.0052156131906214e-05,
1745
+ "loss": 0.7945,
1746
+ "step": 2480
1747
+ },
1748
+ {
1749
+ "epoch": 0.7145931984502798,
1750
+ "grad_norm": 5.546058654785156,
1751
+ "learning_rate": 3.93186061748824e-05,
1752
+ "loss": 0.7676,
1753
+ "step": 2490
1754
+ },
1755
+ {
1756
+ "epoch": 0.7174630506528914,
1757
+ "grad_norm": 4.879994869232178,
1758
+ "learning_rate": 3.859018857072719e-05,
1759
+ "loss": 0.7926,
1760
+ "step": 2500
1761
+ },
1762
+ {
1763
+ "epoch": 0.7203329028555029,
1764
+ "grad_norm": 4.717655181884766,
1765
+ "learning_rate": 3.786696492804812e-05,
1766
+ "loss": 0.7451,
1767
+ "step": 2510
1768
+ },
1769
+ {
1770
+ "epoch": 0.7232027550581145,
1771
+ "grad_norm": 6.432432174682617,
1772
+ "learning_rate": 3.714899641615438e-05,
1773
+ "loss": 0.7938,
1774
+ "step": 2520
1775
+ },
1776
+ {
1777
+ "epoch": 0.7260726072607261,
1778
+ "grad_norm": 5.008986473083496,
1779
+ "learning_rate": 3.6436343759882926e-05,
1780
+ "loss": 0.765,
1781
+ "step": 2530
1782
+ },
1783
+ {
1784
+ "epoch": 0.7289424594633377,
1785
+ "grad_norm": 7.00074577331543,
1786
+ "learning_rate": 3.5729067234462785e-05,
1787
+ "loss": 0.7794,
1788
+ "step": 2540
1789
+ },
1790
+ {
1791
+ "epoch": 0.7318123116659492,
1792
+ "grad_norm": 6.525863170623779,
1793
+ "learning_rate": 3.5027226660416736e-05,
1794
+ "loss": 0.7979,
1795
+ "step": 2550
1796
+ },
1797
+ {
1798
+ "epoch": 0.7346821638685608,
1799
+ "grad_norm": 5.4863786697387695,
1800
+ "learning_rate": 3.433088139850193e-05,
1801
+ "loss": 0.7625,
1802
+ "step": 2560
1803
+ },
1804
+ {
1805
+ "epoch": 0.7375520160711724,
1806
+ "grad_norm": 3.975086212158203,
1807
+ "learning_rate": 3.364009034468926e-05,
1808
+ "loss": 0.7471,
1809
+ "step": 2570
1810
+ },
1811
+ {
1812
+ "epoch": 0.7404218682737839,
1813
+ "grad_norm": 3.787874460220337,
1814
+ "learning_rate": 3.2954911925181876e-05,
1815
+ "loss": 0.7662,
1816
+ "step": 2580
1817
+ },
1818
+ {
1819
+ "epoch": 0.7432917204763955,
1820
+ "grad_norm": 4.633001804351807,
1821
+ "learning_rate": 3.2275404091473795e-05,
1822
+ "loss": 0.774,
1823
+ "step": 2590
1824
+ },
1825
+ {
1826
+ "epoch": 0.7461615726790071,
1827
+ "grad_norm": 4.832580089569092,
1828
+ "learning_rate": 3.1601624315448166e-05,
1829
+ "loss": 0.7749,
1830
+ "step": 2600
1831
+ },
1832
+ {
1833
+ "epoch": 0.7490314248816186,
1834
+ "grad_norm": 4.763906955718994,
1835
+ "learning_rate": 3.0933629584516665e-05,
1836
+ "loss": 0.7438,
1837
+ "step": 2610
1838
+ },
1839
+ {
1840
+ "epoch": 0.7519012770842302,
1841
+ "grad_norm": 4.065663814544678,
1842
+ "learning_rate": 3.027147639679928e-05,
1843
+ "loss": 0.7546,
1844
+ "step": 2620
1845
+ },
1846
+ {
1847
+ "epoch": 0.7547711292868418,
1848
+ "grad_norm": 4.496669769287109,
1849
+ "learning_rate": 2.961522075634604e-05,
1850
+ "loss": 0.7878,
1851
+ "step": 2630
1852
+ },
1853
+ {
1854
+ "epoch": 0.7576409814894532,
1855
+ "grad_norm": 3.8822827339172363,
1856
+ "learning_rate": 2.896491816840008e-05,
1857
+ "loss": 0.7884,
1858
+ "step": 2640
1859
+ },
1860
+ {
1861
+ "epoch": 0.7605108336920648,
1862
+ "grad_norm": 4.25615119934082,
1863
+ "learning_rate": 2.8320623634703147e-05,
1864
+ "loss": 0.7418,
1865
+ "step": 2650
1866
+ },
1867
+ {
1868
+ "epoch": 0.7633806858946764,
1869
+ "grad_norm": 4.472879886627197,
1870
+ "learning_rate": 2.76823916488436e-05,
1871
+ "loss": 0.7944,
1872
+ "step": 2660
1873
+ },
1874
+ {
1875
+ "epoch": 0.7662505380972879,
1876
+ "grad_norm": 6.644125938415527,
1877
+ "learning_rate": 2.705027619164754e-05,
1878
+ "loss": 0.7525,
1879
+ "step": 2670
1880
+ },
1881
+ {
1882
+ "epoch": 0.7691203902998995,
1883
+ "grad_norm": 3.8960325717926025,
1884
+ "learning_rate": 2.6424330726612946e-05,
1885
+ "loss": 0.748,
1886
+ "step": 2680
1887
+ },
1888
+ {
1889
+ "epoch": 0.7719902425025111,
1890
+ "grad_norm": 3.907740354537964,
1891
+ "learning_rate": 2.5804608195388057e-05,
1892
+ "loss": 0.7686,
1893
+ "step": 2690
1894
+ },
1895
+ {
1896
+ "epoch": 0.7748600947051227,
1897
+ "grad_norm": 4.432440757751465,
1898
+ "learning_rate": 2.5191161013293396e-05,
1899
+ "loss": 0.7671,
1900
+ "step": 2700
1901
  }
1902
  ],
1903
  "logging_steps": 10,
 
1917
  "attributes": {}
1918
  }
1919
  },
1920
+ "total_flos": 1.10480834691072e+20,
1921
  "train_batch_size": 6,
1922
  "trial_name": null,
1923
  "trial_params": null