FormlessAI commited on
Commit
2eae30a
·
verified ·
1 Parent(s): 206a683

Training in progress, epoch 0, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2bea86c2ab21a28debfa3ed6a6043f4ff91636439c9d482b3c8179802becd746
3
  size 1037269336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59c72492ae77112d7498629e7110aa4ead00892b60bfde7a53cdde23e0a2a87e
3
  size 1037269336
last-checkpoint/global_step1250/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8a598037ebfc908303cf6ea87f5e2e4d74e1d0a842707dac6c0e72ffee56ebf
3
+ size 781993445
last-checkpoint/global_step1250/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4da36f680835be6833e37a7b6ab4e2817cf05e794e70756229830efd1b9e3268
3
+ size 781993509
last-checkpoint/global_step1250/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de3ca49195600fd131e42997f55ea96b107daad60b29d006f9d752f71ba394bb
3
+ size 781993509
last-checkpoint/global_step1250/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:851352d483715eb54acb2f0c123de01444d7cb9f689e5caf230f1e429a0c5501
3
+ size 781993509
last-checkpoint/global_step1250/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee4398edb588eb56d41bac2a470bd3160de1fdebaa7886521e47edec8b64e533
3
+ size 2610290277
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step1150
 
1
+ global_step1250
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6726cdeb8b70075f075ae39b467e92c8ad954fb2d6ecb28760d5c40cfe202578
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed87dcb091d4ef7e28c34173b3e5e817c8a65a26c060e643a15f114db3b0387e
3
  size 15429
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a635b69b5296666e1b8af251cb13a0268b3862ee34bb7486ba5d61defc337940
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f34f165e5ce4e6a030cf3446153db3218902f01675bb6ef508a5d91da25fb4b4
3
  size 15429
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4fd785ba2444adcae48af89ab3bd88555ca86f4ed8acc791f354c989d26cb509
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa19ff1a67f27b22564aa2ddebd6a615ac92d0b0794aa763662b482303827931
3
  size 15429
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5459584bc0926c43e29fd8f948b9ac16d06294aa8b1f7964cb467b6d9074bf87
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fff90a5aceb3cd4a5999415d57df5f60aeb2a804a347e1c874416d7c196e1499
3
  size 15429
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88efbcbaac7d45a687904255e7dda5f49605f5a5bc135e51d8f7aad51664a870
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d49af4b0761a2e15e6280ed21708d43c8b8fb5531bab12134da87b28369ed4b
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "best_global_step": null,
3
- "best_metric": 2.1799721717834473,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.16717546154964386,
6
  "eval_steps": 50,
7
- "global_step": 1150,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1802,6 +1802,162 @@
1802
  "eval_samples_per_second": 175.223,
1803
  "eval_steps_per_second": 10.988,
1804
  "step": 1150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1805
  }
1806
  ],
1807
  "logging_steps": 5,
@@ -1830,7 +1986,7 @@
1830
  "attributes": {}
1831
  }
1832
  },
1833
- "total_flos": 3.0000328993459405e+17,
1834
  "train_batch_size": 4,
1835
  "trial_name": null,
1836
  "trial_params": null
 
1
  {
2
  "best_global_step": null,
3
+ "best_metric": 2.1646382808685303,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.1817124582061346,
6
  "eval_steps": 50,
7
+ "global_step": 1250,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1802
  "eval_samples_per_second": 175.223,
1803
  "eval_steps_per_second": 10.988,
1804
  "step": 1150
1805
+ },
1806
+ {
1807
+ "epoch": 0.1679023113824684,
1808
+ "grad_norm": 2.660989761352539,
1809
+ "learning_rate": 9.742052914999266e-05,
1810
+ "loss": 2.4101,
1811
+ "step": 1155
1812
+ },
1813
+ {
1814
+ "epoch": 0.16862916121529292,
1815
+ "grad_norm": 2.388467788696289,
1816
+ "learning_rate": 9.73943758877837e-05,
1817
+ "loss": 2.4527,
1818
+ "step": 1160
1819
+ },
1820
+ {
1821
+ "epoch": 0.16935601104811746,
1822
+ "grad_norm": 2.646723508834839,
1823
+ "learning_rate": 9.736810355793018e-05,
1824
+ "loss": 2.3153,
1825
+ "step": 1165
1826
+ },
1827
+ {
1828
+ "epoch": 0.170082860880942,
1829
+ "grad_norm": 2.3404433727264404,
1830
+ "learning_rate": 9.734171222657268e-05,
1831
+ "loss": 2.1586,
1832
+ "step": 1170
1833
+ },
1834
+ {
1835
+ "epoch": 0.17080971071376655,
1836
+ "grad_norm": 2.457658529281616,
1837
+ "learning_rate": 9.731520196015136e-05,
1838
+ "loss": 2.2344,
1839
+ "step": 1175
1840
+ },
1841
+ {
1842
+ "epoch": 0.17153656054659108,
1843
+ "grad_norm": 2.6175413131713867,
1844
+ "learning_rate": 9.728857282540573e-05,
1845
+ "loss": 2.0764,
1846
+ "step": 1180
1847
+ },
1848
+ {
1849
+ "epoch": 0.17226341037941562,
1850
+ "grad_norm": 2.3355302810668945,
1851
+ "learning_rate": 9.726182488937464e-05,
1852
+ "loss": 2.0514,
1853
+ "step": 1185
1854
+ },
1855
+ {
1856
+ "epoch": 0.17299026021224015,
1857
+ "grad_norm": 2.3462462425231934,
1858
+ "learning_rate": 9.7234958219396e-05,
1859
+ "loss": 2.357,
1860
+ "step": 1190
1861
+ },
1862
+ {
1863
+ "epoch": 0.17371711004506468,
1864
+ "grad_norm": 2.643761396408081,
1865
+ "learning_rate": 9.720797288310659e-05,
1866
+ "loss": 2.2057,
1867
+ "step": 1195
1868
+ },
1869
+ {
1870
+ "epoch": 0.17444395987788922,
1871
+ "grad_norm": 2.2287981510162354,
1872
+ "learning_rate": 9.718086894844198e-05,
1873
+ "loss": 2.2767,
1874
+ "step": 1200
1875
+ },
1876
+ {
1877
+ "epoch": 0.17444395987788922,
1878
+ "eval_loss": 2.1676223278045654,
1879
+ "eval_runtime": 22.4479,
1880
+ "eval_samples_per_second": 147.052,
1881
+ "eval_steps_per_second": 9.221,
1882
+ "step": 1200
1883
+ },
1884
+ {
1885
+ "epoch": 0.17517080971071378,
1886
+ "grad_norm": 2.403559684753418,
1887
+ "learning_rate": 9.71536464836363e-05,
1888
+ "loss": 2.4449,
1889
+ "step": 1205
1890
+ },
1891
+ {
1892
+ "epoch": 0.1758976595435383,
1893
+ "grad_norm": 2.6444735527038574,
1894
+ "learning_rate": 9.712630555722204e-05,
1895
+ "loss": 2.0928,
1896
+ "step": 1210
1897
+ },
1898
+ {
1899
+ "epoch": 0.17662450937636284,
1900
+ "grad_norm": 2.7171778678894043,
1901
+ "learning_rate": 9.709884623802998e-05,
1902
+ "loss": 2.3201,
1903
+ "step": 1215
1904
+ },
1905
+ {
1906
+ "epoch": 0.17735135920918738,
1907
+ "grad_norm": 2.6349828243255615,
1908
+ "learning_rate": 9.707126859518893e-05,
1909
+ "loss": 2.2294,
1910
+ "step": 1220
1911
+ },
1912
+ {
1913
+ "epoch": 0.1780782090420119,
1914
+ "grad_norm": 2.41853928565979,
1915
+ "learning_rate": 9.704357269812553e-05,
1916
+ "loss": 2.2768,
1917
+ "step": 1225
1918
+ },
1919
+ {
1920
+ "epoch": 0.17880505887483647,
1921
+ "grad_norm": 2.3369898796081543,
1922
+ "learning_rate": 9.701575861656423e-05,
1923
+ "loss": 2.2812,
1924
+ "step": 1230
1925
+ },
1926
+ {
1927
+ "epoch": 0.179531908707661,
1928
+ "grad_norm": 2.3878183364868164,
1929
+ "learning_rate": 9.698782642052687e-05,
1930
+ "loss": 2.2777,
1931
+ "step": 1235
1932
+ },
1933
+ {
1934
+ "epoch": 0.18025875854048554,
1935
+ "grad_norm": 2.5290329456329346,
1936
+ "learning_rate": 9.695977618033281e-05,
1937
+ "loss": 2.2638,
1938
+ "step": 1240
1939
+ },
1940
+ {
1941
+ "epoch": 0.18098560837331007,
1942
+ "grad_norm": 2.4412193298339844,
1943
+ "learning_rate": 9.693160796659841e-05,
1944
+ "loss": 2.208,
1945
+ "step": 1245
1946
+ },
1947
+ {
1948
+ "epoch": 0.1817124582061346,
1949
+ "grad_norm": 2.3451461791992188,
1950
+ "learning_rate": 9.690332185023718e-05,
1951
+ "loss": 2.1151,
1952
+ "step": 1250
1953
+ },
1954
+ {
1955
+ "epoch": 0.1817124582061346,
1956
+ "eval_loss": 2.1646382808685303,
1957
+ "eval_runtime": 18.8171,
1958
+ "eval_samples_per_second": 175.426,
1959
+ "eval_steps_per_second": 11.001,
1960
+ "step": 1250
1961
  }
1962
  ],
1963
  "logging_steps": 5,
 
1986
  "attributes": {}
1987
  }
1988
  },
1989
+ "total_flos": 3.259402418923766e+17,
1990
  "train_batch_size": 4,
1991
  "trial_name": null,
1992
  "trial_params": null