Azrail commited on
Commit
956e051
·
verified ·
1 Parent(s): c3607e3

Training in progress, step 11000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa4c5a2f220199565c612d1b585e05a7b92fff44644cb0d5fe3e2f43506e0e66
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25851b0b62512a3b653e1b28e1122d3212578c5e77ebf5b79e658ffec9b3b79f
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6389d8c90e4661170f94cbc7fb36a5d3b74c5a0f6b13c2a19b9518515670df99
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df743e5d843f3a7837833bec4e1caf6ae5d4bce7ba980e12a541afbc37b034f0
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf18e0901d5cf90634d477e2ea56f1da923039f227c0abc08997f7c74e97f4e1
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25b1915a16f6a9d0eca7bf59f4a66ab58a1d3558fffae49f30b6000a597cffb1
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:00e07de65a0221f25bbdd3fefea6366e38c04a216122618dd72af76e955ce943
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:811fd3ba19eb7a55c539858dcaf05c190bd36b9252f7748cbb128712f2400a11
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.21966043242901878,
6
  "eval_steps": 500,
7
- "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1788,11 +1788,189 @@
1788
  "eval_steps_per_second": 18.843,
1789
  "num_input_tokens_seen": 10485760000,
1790
  "step": 10000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1791
  }
1792
  ],
1793
  "logging_steps": 50,
1794
  "max_steps": 200000,
1795
- "num_input_tokens_seen": 10485760000,
1796
  "num_train_epochs": 5,
1797
  "save_steps": 1000,
1798
  "stateful_callbacks": {
@@ -1807,7 +1985,7 @@
1807
  "attributes": {}
1808
  }
1809
  },
1810
- "total_flos": 5.97171715964928e+18,
1811
  "train_batch_size": 64,
1812
  "trial_name": null,
1813
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.24162647567192067,
6
  "eval_steps": 500,
7
+ "global_step": 11000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1788
  "eval_steps_per_second": 18.843,
1789
  "num_input_tokens_seen": 10485760000,
1790
  "step": 10000
1791
+ },
1792
+ {
1793
+ "epoch": 0.22075873459116388,
1794
+ "grad_norm": 0.13463908433914185,
1795
+ "learning_rate": 0.001,
1796
+ "loss": 2.8242,
1797
+ "num_input_tokens_seen": 10538188800,
1798
+ "step": 10050
1799
+ },
1800
+ {
1801
+ "epoch": 0.22185703675330898,
1802
+ "grad_norm": 0.11778156459331512,
1803
+ "learning_rate": 0.001,
1804
+ "loss": 2.8234,
1805
+ "num_input_tokens_seen": 10590617600,
1806
+ "step": 10100
1807
+ },
1808
+ {
1809
+ "epoch": 0.22295533891545408,
1810
+ "grad_norm": 0.11393869668245316,
1811
+ "learning_rate": 0.001,
1812
+ "loss": 2.8204,
1813
+ "num_input_tokens_seen": 10643046400,
1814
+ "step": 10150
1815
+ },
1816
+ {
1817
+ "epoch": 0.22405364107759917,
1818
+ "grad_norm": 0.12454303354024887,
1819
+ "learning_rate": 0.001,
1820
+ "loss": 2.8185,
1821
+ "num_input_tokens_seen": 10695475200,
1822
+ "step": 10200
1823
+ },
1824
+ {
1825
+ "epoch": 0.22515194323974427,
1826
+ "grad_norm": 0.1148439347743988,
1827
+ "learning_rate": 0.001,
1828
+ "loss": 2.8219,
1829
+ "num_input_tokens_seen": 10747904000,
1830
+ "step": 10250
1831
+ },
1832
+ {
1833
+ "epoch": 0.22625024540188934,
1834
+ "grad_norm": 0.13888292014598846,
1835
+ "learning_rate": 0.001,
1836
+ "loss": 2.8157,
1837
+ "num_input_tokens_seen": 10800332800,
1838
+ "step": 10300
1839
+ },
1840
+ {
1841
+ "epoch": 0.22734854756403444,
1842
+ "grad_norm": 0.12242749333381653,
1843
+ "learning_rate": 0.001,
1844
+ "loss": 2.8165,
1845
+ "num_input_tokens_seen": 10852761600,
1846
+ "step": 10350
1847
+ },
1848
+ {
1849
+ "epoch": 0.22844684972617954,
1850
+ "grad_norm": 0.13651017844676971,
1851
+ "learning_rate": 0.001,
1852
+ "loss": 2.8165,
1853
+ "num_input_tokens_seen": 10905190400,
1854
+ "step": 10400
1855
+ },
1856
+ {
1857
+ "epoch": 0.22954515188832464,
1858
+ "grad_norm": 0.12349703162908554,
1859
+ "learning_rate": 0.001,
1860
+ "loss": 2.8126,
1861
+ "num_input_tokens_seen": 10957619200,
1862
+ "step": 10450
1863
+ },
1864
+ {
1865
+ "epoch": 0.23064345405046974,
1866
+ "grad_norm": 0.13448943197727203,
1867
+ "learning_rate": 0.001,
1868
+ "loss": 2.8162,
1869
+ "num_input_tokens_seen": 11010048000,
1870
+ "step": 10500
1871
+ },
1872
+ {
1873
+ "epoch": 0.23064345405046974,
1874
+ "eval_loss": 2.720102071762085,
1875
+ "eval_runtime": 65.0663,
1876
+ "eval_samples_per_second": 76.845,
1877
+ "eval_steps_per_second": 19.211,
1878
+ "num_input_tokens_seen": 11010048000,
1879
+ "step": 10500
1880
+ },
1881
+ {
1882
+ "epoch": 0.23174175621261484,
1883
+ "grad_norm": 0.1171165183186531,
1884
+ "learning_rate": 0.001,
1885
+ "loss": 2.817,
1886
+ "num_input_tokens_seen": 11062476800,
1887
+ "step": 10550
1888
+ },
1889
+ {
1890
+ "epoch": 0.2328400583747599,
1891
+ "grad_norm": 0.1417781263589859,
1892
+ "learning_rate": 0.001,
1893
+ "loss": 2.8159,
1894
+ "num_input_tokens_seen": 11114905600,
1895
+ "step": 10600
1896
+ },
1897
+ {
1898
+ "epoch": 0.233938360536905,
1899
+ "grad_norm": 0.13051685690879822,
1900
+ "learning_rate": 0.001,
1901
+ "loss": 2.8062,
1902
+ "num_input_tokens_seen": 11167334400,
1903
+ "step": 10650
1904
+ },
1905
+ {
1906
+ "epoch": 0.2350366626990501,
1907
+ "grad_norm": 0.12536808848381042,
1908
+ "learning_rate": 0.001,
1909
+ "loss": 2.8166,
1910
+ "num_input_tokens_seen": 11219763200,
1911
+ "step": 10700
1912
+ },
1913
+ {
1914
+ "epoch": 0.2361349648611952,
1915
+ "grad_norm": 0.11859289556741714,
1916
+ "learning_rate": 0.001,
1917
+ "loss": 2.8075,
1918
+ "num_input_tokens_seen": 11272192000,
1919
+ "step": 10750
1920
+ },
1921
+ {
1922
+ "epoch": 0.2372332670233403,
1923
+ "grad_norm": 0.14844287931919098,
1924
+ "learning_rate": 0.001,
1925
+ "loss": 2.8139,
1926
+ "num_input_tokens_seen": 11324620800,
1927
+ "step": 10800
1928
+ },
1929
+ {
1930
+ "epoch": 0.2383315691854854,
1931
+ "grad_norm": 0.12877844274044037,
1932
+ "learning_rate": 0.001,
1933
+ "loss": 2.8031,
1934
+ "num_input_tokens_seen": 11377049600,
1935
+ "step": 10850
1936
+ },
1937
+ {
1938
+ "epoch": 0.23942987134763047,
1939
+ "grad_norm": 0.13911722600460052,
1940
+ "learning_rate": 0.001,
1941
+ "loss": 2.7992,
1942
+ "num_input_tokens_seen": 11429478400,
1943
+ "step": 10900
1944
+ },
1945
+ {
1946
+ "epoch": 0.24052817350977557,
1947
+ "grad_norm": 0.156200110912323,
1948
+ "learning_rate": 0.001,
1949
+ "loss": 2.8059,
1950
+ "num_input_tokens_seen": 11481907200,
1951
+ "step": 10950
1952
+ },
1953
+ {
1954
+ "epoch": 0.24162647567192067,
1955
+ "grad_norm": 0.12990960478782654,
1956
+ "learning_rate": 0.001,
1957
+ "loss": 2.7984,
1958
+ "num_input_tokens_seen": 11534336000,
1959
+ "step": 11000
1960
+ },
1961
+ {
1962
+ "epoch": 0.24162647567192067,
1963
+ "eval_loss": 2.7103493213653564,
1964
+ "eval_runtime": 65.6611,
1965
+ "eval_samples_per_second": 76.149,
1966
+ "eval_steps_per_second": 19.037,
1967
+ "num_input_tokens_seen": 11534336000,
1968
+ "step": 11000
1969
  }
1970
  ],
1971
  "logging_steps": 50,
1972
  "max_steps": 200000,
1973
+ "num_input_tokens_seen": 11534336000,
1974
  "num_train_epochs": 5,
1975
  "save_steps": 1000,
1976
  "stateful_callbacks": {
 
1985
  "attributes": {}
1986
  }
1987
  },
1988
+ "total_flos": 6.568888875614208e+18,
1989
  "train_batch_size": 64,
1990
  "trial_name": null,
1991
  "trial_params": null