vrund1346 commited on
Commit
46f435b
·
verified ·
1 Parent(s): a60ed7d

Upload folder using huggingface_hub

Browse files
Files changed (7) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scaler.pt +1 -1
  5. scheduler.pt +1 -1
  6. trainer_state.json +1799 -8
  7. training_args.bin +1 -1
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7b8c22d6220c0fe2b2de3f57d272ce4b5cb98bda61bc4ac7517c15881a305922
3
  size 378311620
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90dec33ca25376d7fab8f5635e6bc543ed75144bb3ab10bcb665ac467671fcac
3
  size 378311620
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d86c5907323c06b1608134d45a800b9d612452f34092afbde0a84ca5d84afc8
3
  size 4771843
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da58438b9f9a5189e0457a9df10c78a966d56673489be0f2fb4f6ff067b812d0
3
  size 4771843
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b09097c1953912c5807314e9f0a1a4766cc4f0aaf84c36d549545d82ce75f791
3
  size 14308
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3777018ee5c1ad8c7475ebd82d76f78b442de355ff486eb51464ae8490de4e40
3
  size 14308
scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0617efbfbb7fc93b69f89ca8af67f24948e1b623a640583096a01c1e14d1a779
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25a65a95fc061242cb0ca7b260e068dca633d7bc3d64ccc03eab3e798e52854b
3
  size 988
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36e97e9eaaa508c9e09406ab76f0eef3b7e362bc390058bc654ab1cf816f9cfc
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb692e54745ea187a8c3a1d90a6ad4458fb449ad0862cc16a6e69e0e4cdc10a8
3
  size 1064
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 5049,
3
- "best_metric": 0.36363636363636365,
4
- "best_model_checkpoint": "./wav2vec2_accent_classification_exp2/checkpoint-5049",
5
- "epoch": 3.0,
6
  "eval_steps": 500,
7
- "global_step": 5049,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1799,12 +1799,1803 @@
1799
  "eval_samples_per_second": 2.822,
1800
  "eval_steps_per_second": 2.822,
1801
  "step": 5049
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1802
  }
1803
  ],
1804
  "logging_steps": 20,
1805
- "max_steps": 5049,
1806
  "num_input_tokens_seen": 0,
1807
- "num_train_epochs": 3,
1808
  "save_steps": 500,
1809
  "stateful_callbacks": {
1810
  "TrainerControl": {
@@ -1818,7 +3609,7 @@
1818
  "attributes": {}
1819
  }
1820
  },
1821
- "total_flos": 1.2716768052351475e+18,
1822
  "train_batch_size": 1,
1823
  "trial_name": null,
1824
  "trial_params": null
 
1
  {
2
+ "best_global_step": 6732,
3
+ "best_metric": 0.4074074074074074,
4
+ "best_model_checkpoint": "./wav2vec2_accent_classification_exp2/checkpoint-6732",
5
+ "epoch": 6.0,
6
  "eval_steps": 500,
7
+ "global_step": 10098,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1799
  "eval_samples_per_second": 2.822,
1800
  "eval_steps_per_second": 2.822,
1801
  "step": 5049
1802
+ },
1803
+ {
1804
+ "epoch": 3.0065359477124183,
1805
+ "grad_norm": 5.259908676147461,
1806
+ "learning_rate": 5.549075704225353e-05,
1807
+ "loss": 1.3437,
1808
+ "step": 5060
1809
+ },
1810
+ {
1811
+ "epoch": 3.0184194890077243,
1812
+ "grad_norm": 14.064613342285156,
1813
+ "learning_rate": 5.5270686619718314e-05,
1814
+ "loss": 1.43,
1815
+ "step": 5080
1816
+ },
1817
+ {
1818
+ "epoch": 3.0303030303030303,
1819
+ "grad_norm": 14.920954704284668,
1820
+ "learning_rate": 5.5050616197183105e-05,
1821
+ "loss": 1.8756,
1822
+ "step": 5100
1823
+ },
1824
+ {
1825
+ "epoch": 3.0421865715983363,
1826
+ "grad_norm": 17.76242446899414,
1827
+ "learning_rate": 5.483054577464789e-05,
1828
+ "loss": 1.3734,
1829
+ "step": 5120
1830
+ },
1831
+ {
1832
+ "epoch": 3.0540701128936423,
1833
+ "grad_norm": 9.35627269744873,
1834
+ "learning_rate": 5.461047535211268e-05,
1835
+ "loss": 1.4487,
1836
+ "step": 5140
1837
+ },
1838
+ {
1839
+ "epoch": 3.0659536541889483,
1840
+ "grad_norm": 5.479264259338379,
1841
+ "learning_rate": 5.4390404929577464e-05,
1842
+ "loss": 1.439,
1843
+ "step": 5160
1844
+ },
1845
+ {
1846
+ "epoch": 3.0778371954842543,
1847
+ "grad_norm": 5.219712257385254,
1848
+ "learning_rate": 5.4170334507042255e-05,
1849
+ "loss": 1.5694,
1850
+ "step": 5180
1851
+ },
1852
+ {
1853
+ "epoch": 3.0897207367795603,
1854
+ "grad_norm": 9.071990966796875,
1855
+ "learning_rate": 5.395026408450704e-05,
1856
+ "loss": 1.6675,
1857
+ "step": 5200
1858
+ },
1859
+ {
1860
+ "epoch": 3.1016042780748663,
1861
+ "grad_norm": 10.13392448425293,
1862
+ "learning_rate": 5.373019366197183e-05,
1863
+ "loss": 1.3387,
1864
+ "step": 5220
1865
+ },
1866
+ {
1867
+ "epoch": 3.1134878193701723,
1868
+ "grad_norm": 11.0034818649292,
1869
+ "learning_rate": 5.351012323943663e-05,
1870
+ "loss": 1.5826,
1871
+ "step": 5240
1872
+ },
1873
+ {
1874
+ "epoch": 3.1253713606654783,
1875
+ "grad_norm": 13.366768836975098,
1876
+ "learning_rate": 5.329005281690141e-05,
1877
+ "loss": 1.4869,
1878
+ "step": 5260
1879
+ },
1880
+ {
1881
+ "epoch": 3.1372549019607843,
1882
+ "grad_norm": 3.3045759201049805,
1883
+ "learning_rate": 5.30699823943662e-05,
1884
+ "loss": 1.0267,
1885
+ "step": 5280
1886
+ },
1887
+ {
1888
+ "epoch": 3.1491384432560903,
1889
+ "grad_norm": 7.309617042541504,
1890
+ "learning_rate": 5.284991197183099e-05,
1891
+ "loss": 1.408,
1892
+ "step": 5300
1893
+ },
1894
+ {
1895
+ "epoch": 3.1610219845513963,
1896
+ "grad_norm": 9.015344619750977,
1897
+ "learning_rate": 5.262984154929578e-05,
1898
+ "loss": 1.6,
1899
+ "step": 5320
1900
+ },
1901
+ {
1902
+ "epoch": 3.1729055258467023,
1903
+ "grad_norm": 0.17147333920001984,
1904
+ "learning_rate": 5.240977112676056e-05,
1905
+ "loss": 1.5795,
1906
+ "step": 5340
1907
+ },
1908
+ {
1909
+ "epoch": 3.1847890671420083,
1910
+ "grad_norm": 11.151992797851562,
1911
+ "learning_rate": 5.218970070422535e-05,
1912
+ "loss": 1.2323,
1913
+ "step": 5360
1914
+ },
1915
+ {
1916
+ "epoch": 3.1966726084373143,
1917
+ "grad_norm": 2.905815839767456,
1918
+ "learning_rate": 5.196963028169014e-05,
1919
+ "loss": 1.3181,
1920
+ "step": 5380
1921
+ },
1922
+ {
1923
+ "epoch": 3.2085561497326203,
1924
+ "grad_norm": 7.193173885345459,
1925
+ "learning_rate": 5.174955985915493e-05,
1926
+ "loss": 1.5094,
1927
+ "step": 5400
1928
+ },
1929
+ {
1930
+ "epoch": 3.2204396910279263,
1931
+ "grad_norm": 8.543306350708008,
1932
+ "learning_rate": 5.152948943661971e-05,
1933
+ "loss": 1.4647,
1934
+ "step": 5420
1935
+ },
1936
+ {
1937
+ "epoch": 3.2323232323232323,
1938
+ "grad_norm": 13.816205978393555,
1939
+ "learning_rate": 5.130941901408452e-05,
1940
+ "loss": 1.6882,
1941
+ "step": 5440
1942
+ },
1943
+ {
1944
+ "epoch": 3.2442067736185383,
1945
+ "grad_norm": 4.082400798797607,
1946
+ "learning_rate": 5.10893485915493e-05,
1947
+ "loss": 1.6905,
1948
+ "step": 5460
1949
+ },
1950
+ {
1951
+ "epoch": 3.2560903149138443,
1952
+ "grad_norm": 17.725006103515625,
1953
+ "learning_rate": 5.086927816901409e-05,
1954
+ "loss": 1.4803,
1955
+ "step": 5480
1956
+ },
1957
+ {
1958
+ "epoch": 3.2679738562091503,
1959
+ "grad_norm": 14.433451652526855,
1960
+ "learning_rate": 5.0649207746478876e-05,
1961
+ "loss": 1.3783,
1962
+ "step": 5500
1963
+ },
1964
+ {
1965
+ "epoch": 3.2798573975044563,
1966
+ "grad_norm": 13.463130950927734,
1967
+ "learning_rate": 5.042913732394367e-05,
1968
+ "loss": 1.5348,
1969
+ "step": 5520
1970
+ },
1971
+ {
1972
+ "epoch": 3.2917409387997623,
1973
+ "grad_norm": 6.446109771728516,
1974
+ "learning_rate": 5.020906690140845e-05,
1975
+ "loss": 1.805,
1976
+ "step": 5540
1977
+ },
1978
+ {
1979
+ "epoch": 3.3036244800950683,
1980
+ "grad_norm": 10.89792251586914,
1981
+ "learning_rate": 4.998899647887324e-05,
1982
+ "loss": 1.5201,
1983
+ "step": 5560
1984
+ },
1985
+ {
1986
+ "epoch": 3.3155080213903743,
1987
+ "grad_norm": 12.57170581817627,
1988
+ "learning_rate": 4.976892605633803e-05,
1989
+ "loss": 1.5706,
1990
+ "step": 5580
1991
+ },
1992
+ {
1993
+ "epoch": 3.3273915626856803,
1994
+ "grad_norm": 5.958362102508545,
1995
+ "learning_rate": 4.9548855633802824e-05,
1996
+ "loss": 1.4606,
1997
+ "step": 5600
1998
+ },
1999
+ {
2000
+ "epoch": 3.3392751039809863,
2001
+ "grad_norm": 12.095161437988281,
2002
+ "learning_rate": 4.932878521126761e-05,
2003
+ "loss": 1.5633,
2004
+ "step": 5620
2005
+ },
2006
+ {
2007
+ "epoch": 3.3511586452762923,
2008
+ "grad_norm": 17.618438720703125,
2009
+ "learning_rate": 4.91087147887324e-05,
2010
+ "loss": 1.5763,
2011
+ "step": 5640
2012
+ },
2013
+ {
2014
+ "epoch": 3.3630421865715983,
2015
+ "grad_norm": 5.972578048706055,
2016
+ "learning_rate": 4.888864436619718e-05,
2017
+ "loss": 1.0006,
2018
+ "step": 5660
2019
+ },
2020
+ {
2021
+ "epoch": 3.3749257278669043,
2022
+ "grad_norm": 11.534244537353516,
2023
+ "learning_rate": 4.8668573943661974e-05,
2024
+ "loss": 1.9326,
2025
+ "step": 5680
2026
+ },
2027
+ {
2028
+ "epoch": 3.3868092691622103,
2029
+ "grad_norm": 7.13762092590332,
2030
+ "learning_rate": 4.8448503521126765e-05,
2031
+ "loss": 1.369,
2032
+ "step": 5700
2033
+ },
2034
+ {
2035
+ "epoch": 3.3986928104575163,
2036
+ "grad_norm": 5.443957328796387,
2037
+ "learning_rate": 4.822843309859155e-05,
2038
+ "loss": 1.6741,
2039
+ "step": 5720
2040
+ },
2041
+ {
2042
+ "epoch": 3.4105763517528223,
2043
+ "grad_norm": 5.366059303283691,
2044
+ "learning_rate": 4.800836267605634e-05,
2045
+ "loss": 1.6731,
2046
+ "step": 5740
2047
+ },
2048
+ {
2049
+ "epoch": 3.4224598930481283,
2050
+ "grad_norm": 6.048867225646973,
2051
+ "learning_rate": 4.7788292253521124e-05,
2052
+ "loss": 1.3663,
2053
+ "step": 5760
2054
+ },
2055
+ {
2056
+ "epoch": 3.4343434343434343,
2057
+ "grad_norm": 14.81528377532959,
2058
+ "learning_rate": 4.7568221830985915e-05,
2059
+ "loss": 2.319,
2060
+ "step": 5780
2061
+ },
2062
+ {
2063
+ "epoch": 3.4462269756387403,
2064
+ "grad_norm": 6.7229814529418945,
2065
+ "learning_rate": 4.7348151408450706e-05,
2066
+ "loss": 1.5563,
2067
+ "step": 5800
2068
+ },
2069
+ {
2070
+ "epoch": 3.4581105169340463,
2071
+ "grad_norm": 7.397293567657471,
2072
+ "learning_rate": 4.71280809859155e-05,
2073
+ "loss": 1.6003,
2074
+ "step": 5820
2075
+ },
2076
+ {
2077
+ "epoch": 3.4699940582293523,
2078
+ "grad_norm": 6.6656718254089355,
2079
+ "learning_rate": 4.690801056338028e-05,
2080
+ "loss": 1.0575,
2081
+ "step": 5840
2082
+ },
2083
+ {
2084
+ "epoch": 3.4818775995246583,
2085
+ "grad_norm": 12.698165893554688,
2086
+ "learning_rate": 4.668794014084507e-05,
2087
+ "loss": 1.7236,
2088
+ "step": 5860
2089
+ },
2090
+ {
2091
+ "epoch": 3.4937611408199643,
2092
+ "grad_norm": 5.691989898681641,
2093
+ "learning_rate": 4.6467869718309856e-05,
2094
+ "loss": 1.3636,
2095
+ "step": 5880
2096
+ },
2097
+ {
2098
+ "epoch": 3.5056446821152702,
2099
+ "grad_norm": 5.1549153327941895,
2100
+ "learning_rate": 4.6247799295774654e-05,
2101
+ "loss": 1.0757,
2102
+ "step": 5900
2103
+ },
2104
+ {
2105
+ "epoch": 3.5175282234105762,
2106
+ "grad_norm": 6.807792663574219,
2107
+ "learning_rate": 4.602772887323944e-05,
2108
+ "loss": 1.4264,
2109
+ "step": 5920
2110
+ },
2111
+ {
2112
+ "epoch": 3.5294117647058822,
2113
+ "grad_norm": 11.486824035644531,
2114
+ "learning_rate": 4.580765845070423e-05,
2115
+ "loss": 1.5486,
2116
+ "step": 5940
2117
+ },
2118
+ {
2119
+ "epoch": 3.5412953060011882,
2120
+ "grad_norm": 19.278827667236328,
2121
+ "learning_rate": 4.558758802816901e-05,
2122
+ "loss": 1.8427,
2123
+ "step": 5960
2124
+ },
2125
+ {
2126
+ "epoch": 3.5531788472964942,
2127
+ "grad_norm": 8.828113555908203,
2128
+ "learning_rate": 4.5367517605633804e-05,
2129
+ "loss": 1.6961,
2130
+ "step": 5980
2131
+ },
2132
+ {
2133
+ "epoch": 3.5650623885918002,
2134
+ "grad_norm": 6.939361095428467,
2135
+ "learning_rate": 4.5147447183098595e-05,
2136
+ "loss": 1.4189,
2137
+ "step": 6000
2138
+ },
2139
+ {
2140
+ "epoch": 3.5769459298871062,
2141
+ "grad_norm": 8.03252124786377,
2142
+ "learning_rate": 4.4927376760563386e-05,
2143
+ "loss": 1.5236,
2144
+ "step": 6020
2145
+ },
2146
+ {
2147
+ "epoch": 3.5888294711824122,
2148
+ "grad_norm": 14.845807075500488,
2149
+ "learning_rate": 4.470730633802817e-05,
2150
+ "loss": 1.3323,
2151
+ "step": 6040
2152
+ },
2153
+ {
2154
+ "epoch": 3.6007130124777182,
2155
+ "grad_norm": 6.234992504119873,
2156
+ "learning_rate": 4.448723591549296e-05,
2157
+ "loss": 1.5554,
2158
+ "step": 6060
2159
+ },
2160
+ {
2161
+ "epoch": 3.6125965537730242,
2162
+ "grad_norm": 9.308655738830566,
2163
+ "learning_rate": 4.4278169014084505e-05,
2164
+ "loss": 1.7403,
2165
+ "step": 6080
2166
+ },
2167
+ {
2168
+ "epoch": 3.6244800950683302,
2169
+ "grad_norm": 6.452903747558594,
2170
+ "learning_rate": 4.4058098591549296e-05,
2171
+ "loss": 1.4449,
2172
+ "step": 6100
2173
+ },
2174
+ {
2175
+ "epoch": 3.6363636363636362,
2176
+ "grad_norm": 5.176253318786621,
2177
+ "learning_rate": 4.383802816901409e-05,
2178
+ "loss": 1.7978,
2179
+ "step": 6120
2180
+ },
2181
+ {
2182
+ "epoch": 3.6482471776589422,
2183
+ "grad_norm": 4.04074764251709,
2184
+ "learning_rate": 4.361795774647888e-05,
2185
+ "loss": 1.444,
2186
+ "step": 6140
2187
+ },
2188
+ {
2189
+ "epoch": 3.6601307189542482,
2190
+ "grad_norm": 0.018241392448544502,
2191
+ "learning_rate": 4.339788732394366e-05,
2192
+ "loss": 1.2793,
2193
+ "step": 6160
2194
+ },
2195
+ {
2196
+ "epoch": 3.6720142602495542,
2197
+ "grad_norm": 25.371074676513672,
2198
+ "learning_rate": 4.317781690140845e-05,
2199
+ "loss": 1.6718,
2200
+ "step": 6180
2201
+ },
2202
+ {
2203
+ "epoch": 3.6838978015448602,
2204
+ "grad_norm": 9.880302429199219,
2205
+ "learning_rate": 4.295774647887324e-05,
2206
+ "loss": 1.7344,
2207
+ "step": 6200
2208
+ },
2209
+ {
2210
+ "epoch": 3.6957813428401662,
2211
+ "grad_norm": 17.729167938232422,
2212
+ "learning_rate": 4.2737676056338035e-05,
2213
+ "loss": 1.6367,
2214
+ "step": 6220
2215
+ },
2216
+ {
2217
+ "epoch": 3.7076648841354722,
2218
+ "grad_norm": 7.4893798828125,
2219
+ "learning_rate": 4.251760563380282e-05,
2220
+ "loss": 1.2116,
2221
+ "step": 6240
2222
+ },
2223
+ {
2224
+ "epoch": 3.7195484254307782,
2225
+ "grad_norm": 7.023115158081055,
2226
+ "learning_rate": 4.229753521126761e-05,
2227
+ "loss": 1.2528,
2228
+ "step": 6260
2229
+ },
2230
+ {
2231
+ "epoch": 3.7314319667260842,
2232
+ "grad_norm": 1.672008752822876,
2233
+ "learning_rate": 4.2077464788732394e-05,
2234
+ "loss": 1.3507,
2235
+ "step": 6280
2236
+ },
2237
+ {
2238
+ "epoch": 3.7433155080213902,
2239
+ "grad_norm": 7.472415924072266,
2240
+ "learning_rate": 4.1857394366197185e-05,
2241
+ "loss": 1.5015,
2242
+ "step": 6300
2243
+ },
2244
+ {
2245
+ "epoch": 3.7551990493166962,
2246
+ "grad_norm": 11.470372200012207,
2247
+ "learning_rate": 4.163732394366197e-05,
2248
+ "loss": 1.0308,
2249
+ "step": 6320
2250
+ },
2251
+ {
2252
+ "epoch": 3.7670825906120022,
2253
+ "grad_norm": 8.578750610351562,
2254
+ "learning_rate": 4.141725352112677e-05,
2255
+ "loss": 1.3318,
2256
+ "step": 6340
2257
+ },
2258
+ {
2259
+ "epoch": 3.7789661319073082,
2260
+ "grad_norm": 7.921478748321533,
2261
+ "learning_rate": 4.119718309859155e-05,
2262
+ "loss": 1.5459,
2263
+ "step": 6360
2264
+ },
2265
+ {
2266
+ "epoch": 3.7908496732026142,
2267
+ "grad_norm": 14.853532791137695,
2268
+ "learning_rate": 4.097711267605634e-05,
2269
+ "loss": 1.8492,
2270
+ "step": 6380
2271
+ },
2272
+ {
2273
+ "epoch": 3.8027332144979202,
2274
+ "grad_norm": 24.788694381713867,
2275
+ "learning_rate": 4.0757042253521126e-05,
2276
+ "loss": 1.8012,
2277
+ "step": 6400
2278
+ },
2279
+ {
2280
+ "epoch": 3.8146167557932262,
2281
+ "grad_norm": 2.027362108230591,
2282
+ "learning_rate": 4.053697183098592e-05,
2283
+ "loss": 1.7795,
2284
+ "step": 6420
2285
+ },
2286
+ {
2287
+ "epoch": 3.8265002970885322,
2288
+ "grad_norm": 5.183379650115967,
2289
+ "learning_rate": 4.031690140845071e-05,
2290
+ "loss": 1.4511,
2291
+ "step": 6440
2292
+ },
2293
+ {
2294
+ "epoch": 3.8383838383838382,
2295
+ "grad_norm": 5.597097873687744,
2296
+ "learning_rate": 4.009683098591549e-05,
2297
+ "loss": 1.2832,
2298
+ "step": 6460
2299
+ },
2300
+ {
2301
+ "epoch": 3.8502673796791442,
2302
+ "grad_norm": 9.230513572692871,
2303
+ "learning_rate": 3.987676056338028e-05,
2304
+ "loss": 1.9395,
2305
+ "step": 6480
2306
+ },
2307
+ {
2308
+ "epoch": 3.8621509209744502,
2309
+ "grad_norm": 7.705165386199951,
2310
+ "learning_rate": 3.9656690140845074e-05,
2311
+ "loss": 1.3734,
2312
+ "step": 6500
2313
+ },
2314
+ {
2315
+ "epoch": 3.8740344622697562,
2316
+ "grad_norm": 6.281782150268555,
2317
+ "learning_rate": 3.943661971830986e-05,
2318
+ "loss": 1.4061,
2319
+ "step": 6520
2320
+ },
2321
+ {
2322
+ "epoch": 3.8859180035650622,
2323
+ "grad_norm": 4.201906204223633,
2324
+ "learning_rate": 3.921654929577465e-05,
2325
+ "loss": 1.49,
2326
+ "step": 6540
2327
+ },
2328
+ {
2329
+ "epoch": 3.8978015448603682,
2330
+ "grad_norm": 8.452786445617676,
2331
+ "learning_rate": 3.899647887323944e-05,
2332
+ "loss": 1.3961,
2333
+ "step": 6560
2334
+ },
2335
+ {
2336
+ "epoch": 3.9096850861556742,
2337
+ "grad_norm": 10.373578071594238,
2338
+ "learning_rate": 3.8776408450704224e-05,
2339
+ "loss": 1.489,
2340
+ "step": 6580
2341
+ },
2342
+ {
2343
+ "epoch": 3.9215686274509802,
2344
+ "grad_norm": 7.816731929779053,
2345
+ "learning_rate": 3.8556338028169015e-05,
2346
+ "loss": 1.3659,
2347
+ "step": 6600
2348
+ },
2349
+ {
2350
+ "epoch": 3.9334521687462862,
2351
+ "grad_norm": 3.9288010597229004,
2352
+ "learning_rate": 3.83362676056338e-05,
2353
+ "loss": 1.7783,
2354
+ "step": 6620
2355
+ },
2356
+ {
2357
+ "epoch": 3.9453357100415927,
2358
+ "grad_norm": 16.12210464477539,
2359
+ "learning_rate": 3.81161971830986e-05,
2360
+ "loss": 1.8105,
2361
+ "step": 6640
2362
+ },
2363
+ {
2364
+ "epoch": 3.9572192513368982,
2365
+ "grad_norm": 3.387218713760376,
2366
+ "learning_rate": 3.789612676056338e-05,
2367
+ "loss": 1.6145,
2368
+ "step": 6660
2369
+ },
2370
+ {
2371
+ "epoch": 3.9691027926322047,
2372
+ "grad_norm": 9.880513191223145,
2373
+ "learning_rate": 3.767605633802817e-05,
2374
+ "loss": 1.7195,
2375
+ "step": 6680
2376
+ },
2377
+ {
2378
+ "epoch": 3.9809863339275102,
2379
+ "grad_norm": 16.724794387817383,
2380
+ "learning_rate": 3.7455985915492956e-05,
2381
+ "loss": 1.8867,
2382
+ "step": 6700
2383
+ },
2384
+ {
2385
+ "epoch": 3.9928698752228167,
2386
+ "grad_norm": 11.043305397033691,
2387
+ "learning_rate": 3.723591549295775e-05,
2388
+ "loss": 1.2336,
2389
+ "step": 6720
2390
+ },
2391
+ {
2392
+ "epoch": 4.0,
2393
+ "eval_accuracy": 0.4074074074074074,
2394
+ "eval_loss": 1.902851939201355,
2395
+ "eval_runtime": 105.2207,
2396
+ "eval_samples_per_second": 2.823,
2397
+ "eval_steps_per_second": 2.823,
2398
+ "step": 6732
2399
+ },
2400
+ {
2401
+ "epoch": 4.004753416518122,
2402
+ "grad_norm": 1.1523293256759644,
2403
+ "learning_rate": 3.701584507042254e-05,
2404
+ "loss": 1.6764,
2405
+ "step": 6740
2406
+ },
2407
+ {
2408
+ "epoch": 4.016636957813429,
2409
+ "grad_norm": 1.856052279472351,
2410
+ "learning_rate": 3.679577464788733e-05,
2411
+ "loss": 2.0626,
2412
+ "step": 6760
2413
+ },
2414
+ {
2415
+ "epoch": 4.028520499108734,
2416
+ "grad_norm": 4.87468147277832,
2417
+ "learning_rate": 3.657570422535211e-05,
2418
+ "loss": 1.3926,
2419
+ "step": 6780
2420
+ },
2421
+ {
2422
+ "epoch": 4.040404040404041,
2423
+ "grad_norm": 8.959086418151855,
2424
+ "learning_rate": 3.6355633802816904e-05,
2425
+ "loss": 1.2396,
2426
+ "step": 6800
2427
+ },
2428
+ {
2429
+ "epoch": 4.052287581699346,
2430
+ "grad_norm": 9.787593841552734,
2431
+ "learning_rate": 3.613556338028169e-05,
2432
+ "loss": 1.2043,
2433
+ "step": 6820
2434
+ },
2435
+ {
2436
+ "epoch": 4.064171122994653,
2437
+ "grad_norm": 20.61239242553711,
2438
+ "learning_rate": 3.5915492957746486e-05,
2439
+ "loss": 1.3016,
2440
+ "step": 6840
2441
+ },
2442
+ {
2443
+ "epoch": 4.076054664289958,
2444
+ "grad_norm": 8.819070816040039,
2445
+ "learning_rate": 3.569542253521127e-05,
2446
+ "loss": 1.3577,
2447
+ "step": 6860
2448
+ },
2449
+ {
2450
+ "epoch": 4.087938205585265,
2451
+ "grad_norm": 25.248939514160156,
2452
+ "learning_rate": 3.547535211267606e-05,
2453
+ "loss": 1.7398,
2454
+ "step": 6880
2455
+ },
2456
+ {
2457
+ "epoch": 4.09982174688057,
2458
+ "grad_norm": 9.243005752563477,
2459
+ "learning_rate": 3.5255281690140845e-05,
2460
+ "loss": 1.1347,
2461
+ "step": 6900
2462
+ },
2463
+ {
2464
+ "epoch": 4.111705288175877,
2465
+ "grad_norm": 9.166324615478516,
2466
+ "learning_rate": 3.5035211267605636e-05,
2467
+ "loss": 1.4357,
2468
+ "step": 6920
2469
+ },
2470
+ {
2471
+ "epoch": 4.123588829471182,
2472
+ "grad_norm": 15.312362670898438,
2473
+ "learning_rate": 3.481514084507043e-05,
2474
+ "loss": 1.0013,
2475
+ "step": 6940
2476
+ },
2477
+ {
2478
+ "epoch": 4.135472370766489,
2479
+ "grad_norm": 2.511606454849243,
2480
+ "learning_rate": 3.459507042253521e-05,
2481
+ "loss": 1.3722,
2482
+ "step": 6960
2483
+ },
2484
+ {
2485
+ "epoch": 4.147355912061794,
2486
+ "grad_norm": 5.319163799285889,
2487
+ "learning_rate": 3.4375e-05,
2488
+ "loss": 1.8293,
2489
+ "step": 6980
2490
+ },
2491
+ {
2492
+ "epoch": 4.159239453357101,
2493
+ "grad_norm": 5.499176979064941,
2494
+ "learning_rate": 3.4154929577464786e-05,
2495
+ "loss": 1.2195,
2496
+ "step": 7000
2497
+ },
2498
+ {
2499
+ "epoch": 4.171122994652406,
2500
+ "grad_norm": 5.21409797668457,
2501
+ "learning_rate": 3.393485915492958e-05,
2502
+ "loss": 1.6016,
2503
+ "step": 7020
2504
+ },
2505
+ {
2506
+ "epoch": 4.183006535947713,
2507
+ "grad_norm": 11.814042091369629,
2508
+ "learning_rate": 3.371478873239437e-05,
2509
+ "loss": 1.1857,
2510
+ "step": 7040
2511
+ },
2512
+ {
2513
+ "epoch": 4.194890077243018,
2514
+ "grad_norm": 10.077415466308594,
2515
+ "learning_rate": 3.349471830985916e-05,
2516
+ "loss": 1.5763,
2517
+ "step": 7060
2518
+ },
2519
+ {
2520
+ "epoch": 4.206773618538325,
2521
+ "grad_norm": 8.21226692199707,
2522
+ "learning_rate": 3.327464788732394e-05,
2523
+ "loss": 1.318,
2524
+ "step": 7080
2525
+ },
2526
+ {
2527
+ "epoch": 4.21865715983363,
2528
+ "grad_norm": 6.55881929397583,
2529
+ "learning_rate": 3.3054577464788734e-05,
2530
+ "loss": 1.5631,
2531
+ "step": 7100
2532
+ },
2533
+ {
2534
+ "epoch": 4.230540701128937,
2535
+ "grad_norm": 3.800264596939087,
2536
+ "learning_rate": 3.283450704225352e-05,
2537
+ "loss": 1.2667,
2538
+ "step": 7120
2539
+ },
2540
+ {
2541
+ "epoch": 4.242424242424242,
2542
+ "grad_norm": 24.056673049926758,
2543
+ "learning_rate": 3.2614436619718316e-05,
2544
+ "loss": 1.9465,
2545
+ "step": 7140
2546
+ },
2547
+ {
2548
+ "epoch": 4.254307783719549,
2549
+ "grad_norm": 15.87082576751709,
2550
+ "learning_rate": 3.23943661971831e-05,
2551
+ "loss": 1.2227,
2552
+ "step": 7160
2553
+ },
2554
+ {
2555
+ "epoch": 4.266191325014854,
2556
+ "grad_norm": 5.343487739562988,
2557
+ "learning_rate": 3.217429577464789e-05,
2558
+ "loss": 1.3855,
2559
+ "step": 7180
2560
+ },
2561
+ {
2562
+ "epoch": 4.278074866310161,
2563
+ "grad_norm": 11.594643592834473,
2564
+ "learning_rate": 3.1954225352112675e-05,
2565
+ "loss": 1.6792,
2566
+ "step": 7200
2567
+ },
2568
+ {
2569
+ "epoch": 4.289958407605466,
2570
+ "grad_norm": 3.1940860748291016,
2571
+ "learning_rate": 3.1734154929577466e-05,
2572
+ "loss": 1.0904,
2573
+ "step": 7220
2574
+ },
2575
+ {
2576
+ "epoch": 4.301841948900773,
2577
+ "grad_norm": 7.972840309143066,
2578
+ "learning_rate": 3.151408450704226e-05,
2579
+ "loss": 1.4883,
2580
+ "step": 7240
2581
+ },
2582
+ {
2583
+ "epoch": 4.313725490196078,
2584
+ "grad_norm": 7.334961891174316,
2585
+ "learning_rate": 3.129401408450705e-05,
2586
+ "loss": 1.8654,
2587
+ "step": 7260
2588
+ },
2589
+ {
2590
+ "epoch": 4.325609031491385,
2591
+ "grad_norm": 4.948212623596191,
2592
+ "learning_rate": 3.107394366197183e-05,
2593
+ "loss": 1.4228,
2594
+ "step": 7280
2595
+ },
2596
+ {
2597
+ "epoch": 4.33749257278669,
2598
+ "grad_norm": 18.957931518554688,
2599
+ "learning_rate": 3.085387323943662e-05,
2600
+ "loss": 1.4826,
2601
+ "step": 7300
2602
+ },
2603
+ {
2604
+ "epoch": 4.349376114081997,
2605
+ "grad_norm": 20.902833938598633,
2606
+ "learning_rate": 3.063380281690141e-05,
2607
+ "loss": 1.2242,
2608
+ "step": 7320
2609
+ },
2610
+ {
2611
+ "epoch": 4.361259655377302,
2612
+ "grad_norm": 25.95677375793457,
2613
+ "learning_rate": 3.04137323943662e-05,
2614
+ "loss": 0.779,
2615
+ "step": 7340
2616
+ },
2617
+ {
2618
+ "epoch": 4.373143196672609,
2619
+ "grad_norm": 10.477136611938477,
2620
+ "learning_rate": 3.019366197183099e-05,
2621
+ "loss": 1.1637,
2622
+ "step": 7360
2623
+ },
2624
+ {
2625
+ "epoch": 4.385026737967914,
2626
+ "grad_norm": 7.310263633728027,
2627
+ "learning_rate": 2.9973591549295776e-05,
2628
+ "loss": 1.6564,
2629
+ "step": 7380
2630
+ },
2631
+ {
2632
+ "epoch": 4.396910279263221,
2633
+ "grad_norm": 48.1497688293457,
2634
+ "learning_rate": 2.9753521126760564e-05,
2635
+ "loss": 2.1001,
2636
+ "step": 7400
2637
+ },
2638
+ {
2639
+ "epoch": 4.408793820558526,
2640
+ "grad_norm": 11.60807991027832,
2641
+ "learning_rate": 2.953345070422535e-05,
2642
+ "loss": 1.7998,
2643
+ "step": 7420
2644
+ },
2645
+ {
2646
+ "epoch": 4.420677361853833,
2647
+ "grad_norm": 3.019927501678467,
2648
+ "learning_rate": 2.9313380281690146e-05,
2649
+ "loss": 1.2465,
2650
+ "step": 7440
2651
+ },
2652
+ {
2653
+ "epoch": 4.432560903149138,
2654
+ "grad_norm": 10.465686798095703,
2655
+ "learning_rate": 2.9093309859154933e-05,
2656
+ "loss": 1.1023,
2657
+ "step": 7460
2658
+ },
2659
+ {
2660
+ "epoch": 4.444444444444445,
2661
+ "grad_norm": 6.366426467895508,
2662
+ "learning_rate": 2.887323943661972e-05,
2663
+ "loss": 1.0901,
2664
+ "step": 7480
2665
+ },
2666
+ {
2667
+ "epoch": 4.45632798573975,
2668
+ "grad_norm": 20.92388343811035,
2669
+ "learning_rate": 2.865316901408451e-05,
2670
+ "loss": 1.2325,
2671
+ "step": 7500
2672
+ },
2673
+ {
2674
+ "epoch": 4.468211527035057,
2675
+ "grad_norm": 17.193758010864258,
2676
+ "learning_rate": 2.8433098591549296e-05,
2677
+ "loss": 1.3298,
2678
+ "step": 7520
2679
+ },
2680
+ {
2681
+ "epoch": 4.480095068330362,
2682
+ "grad_norm": 5.884913921356201,
2683
+ "learning_rate": 2.8213028169014087e-05,
2684
+ "loss": 1.2775,
2685
+ "step": 7540
2686
+ },
2687
+ {
2688
+ "epoch": 4.491978609625669,
2689
+ "grad_norm": 6.033341407775879,
2690
+ "learning_rate": 2.7992957746478874e-05,
2691
+ "loss": 1.4184,
2692
+ "step": 7560
2693
+ },
2694
+ {
2695
+ "epoch": 4.503862150920974,
2696
+ "grad_norm": 44.68092727661133,
2697
+ "learning_rate": 2.7772887323943665e-05,
2698
+ "loss": 1.4361,
2699
+ "step": 7580
2700
+ },
2701
+ {
2702
+ "epoch": 4.515745692216281,
2703
+ "grad_norm": 11.81906795501709,
2704
+ "learning_rate": 2.7552816901408453e-05,
2705
+ "loss": 1.578,
2706
+ "step": 7600
2707
+ },
2708
+ {
2709
+ "epoch": 4.527629233511586,
2710
+ "grad_norm": 12.85550594329834,
2711
+ "learning_rate": 2.733274647887324e-05,
2712
+ "loss": 1.439,
2713
+ "step": 7620
2714
+ },
2715
+ {
2716
+ "epoch": 4.539512774806893,
2717
+ "grad_norm": 5.915256977081299,
2718
+ "learning_rate": 2.711267605633803e-05,
2719
+ "loss": 1.4547,
2720
+ "step": 7640
2721
+ },
2722
+ {
2723
+ "epoch": 4.551396316102198,
2724
+ "grad_norm": 12.035588264465332,
2725
+ "learning_rate": 2.689260563380282e-05,
2726
+ "loss": 1.3403,
2727
+ "step": 7660
2728
+ },
2729
+ {
2730
+ "epoch": 4.563279857397505,
2731
+ "grad_norm": 6.767269611358643,
2732
+ "learning_rate": 2.6672535211267606e-05,
2733
+ "loss": 1.1761,
2734
+ "step": 7680
2735
+ },
2736
+ {
2737
+ "epoch": 4.57516339869281,
2738
+ "grad_norm": 10.033326148986816,
2739
+ "learning_rate": 2.6452464788732394e-05,
2740
+ "loss": 1.352,
2741
+ "step": 7700
2742
+ },
2743
+ {
2744
+ "epoch": 4.587046939988117,
2745
+ "grad_norm": 11.945486068725586,
2746
+ "learning_rate": 2.623239436619718e-05,
2747
+ "loss": 1.5271,
2748
+ "step": 7720
2749
+ },
2750
+ {
2751
+ "epoch": 4.598930481283422,
2752
+ "grad_norm": 15.092402458190918,
2753
+ "learning_rate": 2.601232394366197e-05,
2754
+ "loss": 1.3031,
2755
+ "step": 7740
2756
+ },
2757
+ {
2758
+ "epoch": 4.610814022578729,
2759
+ "grad_norm": 9.422025680541992,
2760
+ "learning_rate": 2.5792253521126763e-05,
2761
+ "loss": 1.2642,
2762
+ "step": 7760
2763
+ },
2764
+ {
2765
+ "epoch": 4.622697563874034,
2766
+ "grad_norm": 6.706630706787109,
2767
+ "learning_rate": 2.557218309859155e-05,
2768
+ "loss": 1.4985,
2769
+ "step": 7780
2770
+ },
2771
+ {
2772
+ "epoch": 4.634581105169341,
2773
+ "grad_norm": 11.070060729980469,
2774
+ "learning_rate": 2.535211267605634e-05,
2775
+ "loss": 1.3693,
2776
+ "step": 7800
2777
+ },
2778
+ {
2779
+ "epoch": 4.646464646464646,
2780
+ "grad_norm": 21.464950561523438,
2781
+ "learning_rate": 2.5132042253521126e-05,
2782
+ "loss": 1.2615,
2783
+ "step": 7820
2784
+ },
2785
+ {
2786
+ "epoch": 4.658348187759953,
2787
+ "grad_norm": 0.3174152076244354,
2788
+ "learning_rate": 2.4911971830985917e-05,
2789
+ "loss": 1.7061,
2790
+ "step": 7840
2791
+ },
2792
+ {
2793
+ "epoch": 4.670231729055258,
2794
+ "grad_norm": 10.314111709594727,
2795
+ "learning_rate": 2.4691901408450704e-05,
2796
+ "loss": 1.2515,
2797
+ "step": 7860
2798
+ },
2799
+ {
2800
+ "epoch": 4.682115270350565,
2801
+ "grad_norm": 1.4397400617599487,
2802
+ "learning_rate": 2.4471830985915495e-05,
2803
+ "loss": 1.4254,
2804
+ "step": 7880
2805
+ },
2806
+ {
2807
+ "epoch": 4.69399881164587,
2808
+ "grad_norm": 11.450037002563477,
2809
+ "learning_rate": 2.4251760563380283e-05,
2810
+ "loss": 1.1362,
2811
+ "step": 7900
2812
+ },
2813
+ {
2814
+ "epoch": 4.705882352941177,
2815
+ "grad_norm": 17.582212448120117,
2816
+ "learning_rate": 2.4031690140845074e-05,
2817
+ "loss": 1.6171,
2818
+ "step": 7920
2819
+ },
2820
+ {
2821
+ "epoch": 4.717765894236482,
2822
+ "grad_norm": 0.15067483484745026,
2823
+ "learning_rate": 2.381161971830986e-05,
2824
+ "loss": 1.0266,
2825
+ "step": 7940
2826
+ },
2827
+ {
2828
+ "epoch": 4.729649435531789,
2829
+ "grad_norm": 0.35611966252326965,
2830
+ "learning_rate": 2.359154929577465e-05,
2831
+ "loss": 1.8023,
2832
+ "step": 7960
2833
+ },
2834
+ {
2835
+ "epoch": 4.741532976827094,
2836
+ "grad_norm": 10.370597839355469,
2837
+ "learning_rate": 2.337147887323944e-05,
2838
+ "loss": 1.8925,
2839
+ "step": 7980
2840
+ },
2841
+ {
2842
+ "epoch": 4.753416518122401,
2843
+ "grad_norm": 5.118237495422363,
2844
+ "learning_rate": 2.3151408450704227e-05,
2845
+ "loss": 1.2258,
2846
+ "step": 8000
2847
+ },
2848
+ {
2849
+ "epoch": 4.765300059417706,
2850
+ "grad_norm": 29.316865921020508,
2851
+ "learning_rate": 2.2931338028169015e-05,
2852
+ "loss": 1.0854,
2853
+ "step": 8020
2854
+ },
2855
+ {
2856
+ "epoch": 4.777183600713013,
2857
+ "grad_norm": 12.539017677307129,
2858
+ "learning_rate": 2.2711267605633806e-05,
2859
+ "loss": 1.529,
2860
+ "step": 8040
2861
+ },
2862
+ {
2863
+ "epoch": 4.789067142008318,
2864
+ "grad_norm": 16.720726013183594,
2865
+ "learning_rate": 2.2491197183098593e-05,
2866
+ "loss": 1.8251,
2867
+ "step": 8060
2868
+ },
2869
+ {
2870
+ "epoch": 4.800950683303625,
2871
+ "grad_norm": 16.409805297851562,
2872
+ "learning_rate": 2.227112676056338e-05,
2873
+ "loss": 1.0862,
2874
+ "step": 8080
2875
+ },
2876
+ {
2877
+ "epoch": 4.81283422459893,
2878
+ "grad_norm": 18.887989044189453,
2879
+ "learning_rate": 2.205105633802817e-05,
2880
+ "loss": 1.2557,
2881
+ "step": 8100
2882
+ },
2883
+ {
2884
+ "epoch": 4.824717765894237,
2885
+ "grad_norm": 2.1975255012512207,
2886
+ "learning_rate": 2.1830985915492956e-05,
2887
+ "loss": 1.2347,
2888
+ "step": 8120
2889
+ },
2890
+ {
2891
+ "epoch": 4.836601307189542,
2892
+ "grad_norm": 0.16356885433197021,
2893
+ "learning_rate": 2.1610915492957747e-05,
2894
+ "loss": 1.2155,
2895
+ "step": 8140
2896
+ },
2897
+ {
2898
+ "epoch": 4.848484848484849,
2899
+ "grad_norm": 27.676633834838867,
2900
+ "learning_rate": 2.1390845070422534e-05,
2901
+ "loss": 2.5601,
2902
+ "step": 8160
2903
+ },
2904
+ {
2905
+ "epoch": 4.860368389780154,
2906
+ "grad_norm": 6.721032619476318,
2907
+ "learning_rate": 2.1170774647887325e-05,
2908
+ "loss": 1.0707,
2909
+ "step": 8180
2910
+ },
2911
+ {
2912
+ "epoch": 4.872251931075461,
2913
+ "grad_norm": 17.30257225036621,
2914
+ "learning_rate": 2.0950704225352113e-05,
2915
+ "loss": 1.305,
2916
+ "step": 8200
2917
+ },
2918
+ {
2919
+ "epoch": 4.884135472370766,
2920
+ "grad_norm": 12.832918167114258,
2921
+ "learning_rate": 2.07306338028169e-05,
2922
+ "loss": 1.2079,
2923
+ "step": 8220
2924
+ },
2925
+ {
2926
+ "epoch": 4.896019013666073,
2927
+ "grad_norm": 25.992534637451172,
2928
+ "learning_rate": 2.051056338028169e-05,
2929
+ "loss": 1.6578,
2930
+ "step": 8240
2931
+ },
2932
+ {
2933
+ "epoch": 4.907902554961378,
2934
+ "grad_norm": 21.847524642944336,
2935
+ "learning_rate": 2.029049295774648e-05,
2936
+ "loss": 1.221,
2937
+ "step": 8260
2938
+ },
2939
+ {
2940
+ "epoch": 4.919786096256685,
2941
+ "grad_norm": 15.508858680725098,
2942
+ "learning_rate": 2.007042253521127e-05,
2943
+ "loss": 1.4104,
2944
+ "step": 8280
2945
+ },
2946
+ {
2947
+ "epoch": 4.93166963755199,
2948
+ "grad_norm": 16.669252395629883,
2949
+ "learning_rate": 1.9850352112676057e-05,
2950
+ "loss": 1.6439,
2951
+ "step": 8300
2952
+ },
2953
+ {
2954
+ "epoch": 4.943553178847297,
2955
+ "grad_norm": 11.705305099487305,
2956
+ "learning_rate": 1.9630281690140845e-05,
2957
+ "loss": 1.1035,
2958
+ "step": 8320
2959
+ },
2960
+ {
2961
+ "epoch": 4.955436720142602,
2962
+ "grad_norm": 6.209210395812988,
2963
+ "learning_rate": 1.9410211267605636e-05,
2964
+ "loss": 1.0942,
2965
+ "step": 8340
2966
+ },
2967
+ {
2968
+ "epoch": 4.967320261437909,
2969
+ "grad_norm": 28.53778839111328,
2970
+ "learning_rate": 1.9190140845070423e-05,
2971
+ "loss": 1.3909,
2972
+ "step": 8360
2973
+ },
2974
+ {
2975
+ "epoch": 4.979203802733214,
2976
+ "grad_norm": 16.432416915893555,
2977
+ "learning_rate": 1.8970070422535214e-05,
2978
+ "loss": 1.7145,
2979
+ "step": 8380
2980
+ },
2981
+ {
2982
+ "epoch": 4.991087344028521,
2983
+ "grad_norm": 50.011844635009766,
2984
+ "learning_rate": 1.8750000000000002e-05,
2985
+ "loss": 1.5421,
2986
+ "step": 8400
2987
+ },
2988
+ {
2989
+ "epoch": 5.0,
2990
+ "eval_accuracy": 0.39730639730639733,
2991
+ "eval_loss": 2.0971696376800537,
2992
+ "eval_runtime": 105.4359,
2993
+ "eval_samples_per_second": 2.817,
2994
+ "eval_steps_per_second": 2.817,
2995
+ "step": 8415
2996
+ },
2997
+ {
2998
+ "epoch": 5.002970885323826,
2999
+ "grad_norm": 0.27923718094825745,
3000
+ "learning_rate": 1.852992957746479e-05,
3001
+ "loss": 0.9886,
3002
+ "step": 8420
3003
+ },
3004
+ {
3005
+ "epoch": 5.014854426619133,
3006
+ "grad_norm": 14.41185188293457,
3007
+ "learning_rate": 1.830985915492958e-05,
3008
+ "loss": 1.1069,
3009
+ "step": 8440
3010
+ },
3011
+ {
3012
+ "epoch": 5.026737967914438,
3013
+ "grad_norm": 10.032986640930176,
3014
+ "learning_rate": 1.8089788732394368e-05,
3015
+ "loss": 1.3234,
3016
+ "step": 8460
3017
+ },
3018
+ {
3019
+ "epoch": 5.038621509209745,
3020
+ "grad_norm": 0.9974661469459534,
3021
+ "learning_rate": 1.7869718309859155e-05,
3022
+ "loss": 0.8732,
3023
+ "step": 8480
3024
+ },
3025
+ {
3026
+ "epoch": 5.05050505050505,
3027
+ "grad_norm": 8.552937507629395,
3028
+ "learning_rate": 1.7649647887323946e-05,
3029
+ "loss": 1.3454,
3030
+ "step": 8500
3031
+ },
3032
+ {
3033
+ "epoch": 5.062388591800357,
3034
+ "grad_norm": 1.974118709564209,
3035
+ "learning_rate": 1.7440580985915494e-05,
3036
+ "loss": 1.6707,
3037
+ "step": 8520
3038
+ },
3039
+ {
3040
+ "epoch": 5.074272133095662,
3041
+ "grad_norm": 3.7020468711853027,
3042
+ "learning_rate": 1.722051056338028e-05,
3043
+ "loss": 1.6472,
3044
+ "step": 8540
3045
+ },
3046
+ {
3047
+ "epoch": 5.086155674390969,
3048
+ "grad_norm": 19.44769287109375,
3049
+ "learning_rate": 1.7000440140845072e-05,
3050
+ "loss": 1.2964,
3051
+ "step": 8560
3052
+ },
3053
+ {
3054
+ "epoch": 5.098039215686274,
3055
+ "grad_norm": 9.980234146118164,
3056
+ "learning_rate": 1.678036971830986e-05,
3057
+ "loss": 1.2214,
3058
+ "step": 8580
3059
+ },
3060
+ {
3061
+ "epoch": 5.109922756981581,
3062
+ "grad_norm": 12.875494003295898,
3063
+ "learning_rate": 1.6560299295774647e-05,
3064
+ "loss": 1.2813,
3065
+ "step": 8600
3066
+ },
3067
+ {
3068
+ "epoch": 5.121806298276886,
3069
+ "grad_norm": 10.326569557189941,
3070
+ "learning_rate": 1.6340228873239438e-05,
3071
+ "loss": 1.6121,
3072
+ "step": 8620
3073
+ },
3074
+ {
3075
+ "epoch": 5.133689839572193,
3076
+ "grad_norm": 4.770064353942871,
3077
+ "learning_rate": 1.6120158450704226e-05,
3078
+ "loss": 1.3623,
3079
+ "step": 8640
3080
+ },
3081
+ {
3082
+ "epoch": 5.145573380867498,
3083
+ "grad_norm": 7.65989351272583,
3084
+ "learning_rate": 1.5911091549295777e-05,
3085
+ "loss": 1.4138,
3086
+ "step": 8660
3087
+ },
3088
+ {
3089
+ "epoch": 5.157456922162805,
3090
+ "grad_norm": 15.983546257019043,
3091
+ "learning_rate": 1.5691021126760564e-05,
3092
+ "loss": 1.9227,
3093
+ "step": 8680
3094
+ },
3095
+ {
3096
+ "epoch": 5.16934046345811,
3097
+ "grad_norm": 12.394526481628418,
3098
+ "learning_rate": 1.5470950704225355e-05,
3099
+ "loss": 1.3912,
3100
+ "step": 8700
3101
+ },
3102
+ {
3103
+ "epoch": 5.181224004753417,
3104
+ "grad_norm": 9.449789047241211,
3105
+ "learning_rate": 1.5250880281690141e-05,
3106
+ "loss": 1.0565,
3107
+ "step": 8720
3108
+ },
3109
+ {
3110
+ "epoch": 5.193107546048722,
3111
+ "grad_norm": 2.6865243911743164,
3112
+ "learning_rate": 1.5030809859154929e-05,
3113
+ "loss": 1.3442,
3114
+ "step": 8740
3115
+ },
3116
+ {
3117
+ "epoch": 5.204991087344029,
3118
+ "grad_norm": 4.256808280944824,
3119
+ "learning_rate": 1.481073943661972e-05,
3120
+ "loss": 1.1088,
3121
+ "step": 8760
3122
+ },
3123
+ {
3124
+ "epoch": 5.216874628639334,
3125
+ "grad_norm": 25.540273666381836,
3126
+ "learning_rate": 1.4590669014084507e-05,
3127
+ "loss": 1.4719,
3128
+ "step": 8780
3129
+ },
3130
+ {
3131
+ "epoch": 5.228758169934641,
3132
+ "grad_norm": 16.564638137817383,
3133
+ "learning_rate": 1.4370598591549298e-05,
3134
+ "loss": 1.5104,
3135
+ "step": 8800
3136
+ },
3137
+ {
3138
+ "epoch": 5.240641711229946,
3139
+ "grad_norm": 16.423961639404297,
3140
+ "learning_rate": 1.4150528169014086e-05,
3141
+ "loss": 1.324,
3142
+ "step": 8820
3143
+ },
3144
+ {
3145
+ "epoch": 5.252525252525253,
3146
+ "grad_norm": 7.4870805740356445,
3147
+ "learning_rate": 1.3930457746478873e-05,
3148
+ "loss": 1.4307,
3149
+ "step": 8840
3150
+ },
3151
+ {
3152
+ "epoch": 5.264408793820558,
3153
+ "grad_norm": 1.6392219066619873,
3154
+ "learning_rate": 1.3710387323943662e-05,
3155
+ "loss": 1.4722,
3156
+ "step": 8860
3157
+ },
3158
+ {
3159
+ "epoch": 5.276292335115865,
3160
+ "grad_norm": 10.110002517700195,
3161
+ "learning_rate": 1.349031690140845e-05,
3162
+ "loss": 1.9203,
3163
+ "step": 8880
3164
+ },
3165
+ {
3166
+ "epoch": 5.28817587641117,
3167
+ "grad_norm": 0.26278719305992126,
3168
+ "learning_rate": 1.327024647887324e-05,
3169
+ "loss": 1.4466,
3170
+ "step": 8900
3171
+ },
3172
+ {
3173
+ "epoch": 5.300059417706477,
3174
+ "grad_norm": 13.663880348205566,
3175
+ "learning_rate": 1.3050176056338028e-05,
3176
+ "loss": 1.1327,
3177
+ "step": 8920
3178
+ },
3179
+ {
3180
+ "epoch": 5.311942959001782,
3181
+ "grad_norm": 17.244321823120117,
3182
+ "learning_rate": 1.2830105633802816e-05,
3183
+ "loss": 1.1933,
3184
+ "step": 8940
3185
+ },
3186
+ {
3187
+ "epoch": 5.323826500297089,
3188
+ "grad_norm": 6.0645341873168945,
3189
+ "learning_rate": 1.2610035211267607e-05,
3190
+ "loss": 1.1798,
3191
+ "step": 8960
3192
+ },
3193
+ {
3194
+ "epoch": 5.335710041592394,
3195
+ "grad_norm": 2.7921664714813232,
3196
+ "learning_rate": 1.2389964788732394e-05,
3197
+ "loss": 1.1191,
3198
+ "step": 8980
3199
+ },
3200
+ {
3201
+ "epoch": 5.347593582887701,
3202
+ "grad_norm": 3.2458791732788086,
3203
+ "learning_rate": 1.2169894366197184e-05,
3204
+ "loss": 1.2999,
3205
+ "step": 9000
3206
+ },
3207
+ {
3208
+ "epoch": 5.359477124183006,
3209
+ "grad_norm": 0.3115676939487457,
3210
+ "learning_rate": 1.1949823943661973e-05,
3211
+ "loss": 1.4403,
3212
+ "step": 9020
3213
+ },
3214
+ {
3215
+ "epoch": 5.371360665478313,
3216
+ "grad_norm": 17.281970977783203,
3217
+ "learning_rate": 1.1729753521126762e-05,
3218
+ "loss": 1.4465,
3219
+ "step": 9040
3220
+ },
3221
+ {
3222
+ "epoch": 5.383244206773618,
3223
+ "grad_norm": 5.0808539390563965,
3224
+ "learning_rate": 1.1509683098591551e-05,
3225
+ "loss": 1.2003,
3226
+ "step": 9060
3227
+ },
3228
+ {
3229
+ "epoch": 5.395127748068925,
3230
+ "grad_norm": 0.5730569958686829,
3231
+ "learning_rate": 1.1289612676056339e-05,
3232
+ "loss": 1.3958,
3233
+ "step": 9080
3234
+ },
3235
+ {
3236
+ "epoch": 5.40701128936423,
3237
+ "grad_norm": 5.8343424797058105,
3238
+ "learning_rate": 1.1069542253521126e-05,
3239
+ "loss": 1.4439,
3240
+ "step": 9100
3241
+ },
3242
+ {
3243
+ "epoch": 5.418894830659537,
3244
+ "grad_norm": 14.066546440124512,
3245
+ "learning_rate": 1.0849471830985916e-05,
3246
+ "loss": 1.4107,
3247
+ "step": 9120
3248
+ },
3249
+ {
3250
+ "epoch": 5.430778371954842,
3251
+ "grad_norm": 2.0192813873291016,
3252
+ "learning_rate": 1.0629401408450705e-05,
3253
+ "loss": 1.2033,
3254
+ "step": 9140
3255
+ },
3256
+ {
3257
+ "epoch": 5.442661913250149,
3258
+ "grad_norm": 24.278059005737305,
3259
+ "learning_rate": 1.0409330985915492e-05,
3260
+ "loss": 1.4415,
3261
+ "step": 9160
3262
+ },
3263
+ {
3264
+ "epoch": 5.454545454545454,
3265
+ "grad_norm": 45.362178802490234,
3266
+ "learning_rate": 1.0189260563380282e-05,
3267
+ "loss": 1.6362,
3268
+ "step": 9180
3269
+ },
3270
+ {
3271
+ "epoch": 5.466428995840761,
3272
+ "grad_norm": 0.005078072659671307,
3273
+ "learning_rate": 9.96919014084507e-06,
3274
+ "loss": 1.4426,
3275
+ "step": 9200
3276
+ },
3277
+ {
3278
+ "epoch": 5.478312537136066,
3279
+ "grad_norm": 1.114734172821045,
3280
+ "learning_rate": 9.74911971830986e-06,
3281
+ "loss": 1.2417,
3282
+ "step": 9220
3283
+ },
3284
+ {
3285
+ "epoch": 5.490196078431373,
3286
+ "grad_norm": 5.923354625701904,
3287
+ "learning_rate": 9.52904929577465e-06,
3288
+ "loss": 1.2631,
3289
+ "step": 9240
3290
+ },
3291
+ {
3292
+ "epoch": 5.502079619726679,
3293
+ "grad_norm": 19.652738571166992,
3294
+ "learning_rate": 9.308978873239437e-06,
3295
+ "loss": 0.9863,
3296
+ "step": 9260
3297
+ },
3298
+ {
3299
+ "epoch": 5.513963161021985,
3300
+ "grad_norm": 7.192227363586426,
3301
+ "learning_rate": 9.088908450704226e-06,
3302
+ "loss": 0.7249,
3303
+ "step": 9280
3304
+ },
3305
+ {
3306
+ "epoch": 5.52584670231729,
3307
+ "grad_norm": 0.43900468945503235,
3308
+ "learning_rate": 8.868838028169015e-06,
3309
+ "loss": 1.2102,
3310
+ "step": 9300
3311
+ },
3312
+ {
3313
+ "epoch": 5.537730243612597,
3314
+ "grad_norm": 4.792228698730469,
3315
+ "learning_rate": 8.648767605633803e-06,
3316
+ "loss": 1.4583,
3317
+ "step": 9320
3318
+ },
3319
+ {
3320
+ "epoch": 5.549613784907903,
3321
+ "grad_norm": 4.418882846832275,
3322
+ "learning_rate": 8.428697183098592e-06,
3323
+ "loss": 1.427,
3324
+ "step": 9340
3325
+ },
3326
+ {
3327
+ "epoch": 5.561497326203209,
3328
+ "grad_norm": 6.352531909942627,
3329
+ "learning_rate": 8.20862676056338e-06,
3330
+ "loss": 1.4876,
3331
+ "step": 9360
3332
+ },
3333
+ {
3334
+ "epoch": 5.573380867498514,
3335
+ "grad_norm": 5.210249900817871,
3336
+ "learning_rate": 7.988556338028169e-06,
3337
+ "loss": 0.7589,
3338
+ "step": 9380
3339
+ },
3340
+ {
3341
+ "epoch": 5.585264408793821,
3342
+ "grad_norm": 5.8901543617248535,
3343
+ "learning_rate": 7.768485915492958e-06,
3344
+ "loss": 0.8805,
3345
+ "step": 9400
3346
+ },
3347
+ {
3348
+ "epoch": 5.597147950089127,
3349
+ "grad_norm": 24.47085189819336,
3350
+ "learning_rate": 7.548415492957747e-06,
3351
+ "loss": 1.5747,
3352
+ "step": 9420
3353
+ },
3354
+ {
3355
+ "epoch": 5.609031491384433,
3356
+ "grad_norm": 16.86648178100586,
3357
+ "learning_rate": 7.3283450704225365e-06,
3358
+ "loss": 1.3989,
3359
+ "step": 9440
3360
+ },
3361
+ {
3362
+ "epoch": 5.620915032679738,
3363
+ "grad_norm": 6.325050354003906,
3364
+ "learning_rate": 7.108274647887324e-06,
3365
+ "loss": 1.2151,
3366
+ "step": 9460
3367
+ },
3368
+ {
3369
+ "epoch": 5.632798573975045,
3370
+ "grad_norm": 8.903736114501953,
3371
+ "learning_rate": 6.888204225352113e-06,
3372
+ "loss": 1.2345,
3373
+ "step": 9480
3374
+ },
3375
+ {
3376
+ "epoch": 5.644682115270351,
3377
+ "grad_norm": 9.477288246154785,
3378
+ "learning_rate": 6.668133802816902e-06,
3379
+ "loss": 1.6694,
3380
+ "step": 9500
3381
+ },
3382
+ {
3383
+ "epoch": 5.656565656565657,
3384
+ "grad_norm": 79.3915023803711,
3385
+ "learning_rate": 6.448063380281691e-06,
3386
+ "loss": 1.4245,
3387
+ "step": 9520
3388
+ },
3389
+ {
3390
+ "epoch": 5.668449197860962,
3391
+ "grad_norm": 12.133055686950684,
3392
+ "learning_rate": 6.227992957746479e-06,
3393
+ "loss": 1.2977,
3394
+ "step": 9540
3395
+ },
3396
+ {
3397
+ "epoch": 5.680332739156269,
3398
+ "grad_norm": 4.151798248291016,
3399
+ "learning_rate": 6.0079225352112685e-06,
3400
+ "loss": 1.2221,
3401
+ "step": 9560
3402
+ },
3403
+ {
3404
+ "epoch": 5.692216280451575,
3405
+ "grad_norm": 4.959856033325195,
3406
+ "learning_rate": 5.787852112676057e-06,
3407
+ "loss": 1.1806,
3408
+ "step": 9580
3409
+ },
3410
+ {
3411
+ "epoch": 5.704099821746881,
3412
+ "grad_norm": 5.8293633460998535,
3413
+ "learning_rate": 5.567781690140845e-06,
3414
+ "loss": 1.8405,
3415
+ "step": 9600
3416
+ },
3417
+ {
3418
+ "epoch": 5.715983363042186,
3419
+ "grad_norm": 0.5208277106285095,
3420
+ "learning_rate": 5.347711267605634e-06,
3421
+ "loss": 1.0997,
3422
+ "step": 9620
3423
+ },
3424
+ {
3425
+ "epoch": 5.727866904337493,
3426
+ "grad_norm": 0.5993483066558838,
3427
+ "learning_rate": 5.127640845070423e-06,
3428
+ "loss": 1.3571,
3429
+ "step": 9640
3430
+ },
3431
+ {
3432
+ "epoch": 5.739750445632799,
3433
+ "grad_norm": 6.959007263183594,
3434
+ "learning_rate": 4.907570422535211e-06,
3435
+ "loss": 1.5781,
3436
+ "step": 9660
3437
+ },
3438
+ {
3439
+ "epoch": 5.751633986928105,
3440
+ "grad_norm": 11.25479507446289,
3441
+ "learning_rate": 4.6875000000000004e-06,
3442
+ "loss": 1.4159,
3443
+ "step": 9680
3444
+ },
3445
+ {
3446
+ "epoch": 5.76351752822341,
3447
+ "grad_norm": 24.656896591186523,
3448
+ "learning_rate": 4.467429577464789e-06,
3449
+ "loss": 0.9139,
3450
+ "step": 9700
3451
+ },
3452
+ {
3453
+ "epoch": 5.775401069518717,
3454
+ "grad_norm": 10.791423797607422,
3455
+ "learning_rate": 4.247359154929577e-06,
3456
+ "loss": 1.4309,
3457
+ "step": 9720
3458
+ },
3459
+ {
3460
+ "epoch": 5.787284610814023,
3461
+ "grad_norm": 5.377380847930908,
3462
+ "learning_rate": 4.0272887323943664e-06,
3463
+ "loss": 1.6518,
3464
+ "step": 9740
3465
+ },
3466
+ {
3467
+ "epoch": 5.799168152109329,
3468
+ "grad_norm": 17.60517120361328,
3469
+ "learning_rate": 3.807218309859155e-06,
3470
+ "loss": 1.4515,
3471
+ "step": 9760
3472
+ },
3473
+ {
3474
+ "epoch": 5.811051693404634,
3475
+ "grad_norm": 10.215800285339355,
3476
+ "learning_rate": 3.587147887323944e-06,
3477
+ "loss": 0.8738,
3478
+ "step": 9780
3479
+ },
3480
+ {
3481
+ "epoch": 5.822935234699941,
3482
+ "grad_norm": 19.97536277770996,
3483
+ "learning_rate": 3.3670774647887324e-06,
3484
+ "loss": 1.5413,
3485
+ "step": 9800
3486
+ },
3487
+ {
3488
+ "epoch": 5.834818775995247,
3489
+ "grad_norm": 6.600990295410156,
3490
+ "learning_rate": 3.1470070422535212e-06,
3491
+ "loss": 1.3498,
3492
+ "step": 9820
3493
+ },
3494
+ {
3495
+ "epoch": 5.846702317290553,
3496
+ "grad_norm": 6.998388767242432,
3497
+ "learning_rate": 2.92693661971831e-06,
3498
+ "loss": 1.1162,
3499
+ "step": 9840
3500
+ },
3501
+ {
3502
+ "epoch": 5.858585858585858,
3503
+ "grad_norm": 9.804807662963867,
3504
+ "learning_rate": 2.7068661971830984e-06,
3505
+ "loss": 1.5677,
3506
+ "step": 9860
3507
+ },
3508
+ {
3509
+ "epoch": 5.870469399881165,
3510
+ "grad_norm": 7.46671724319458,
3511
+ "learning_rate": 2.4867957746478876e-06,
3512
+ "loss": 1.1665,
3513
+ "step": 9880
3514
+ },
3515
+ {
3516
+ "epoch": 5.882352941176471,
3517
+ "grad_norm": 10.575834274291992,
3518
+ "learning_rate": 2.2667253521126764e-06,
3519
+ "loss": 1.2868,
3520
+ "step": 9900
3521
+ },
3522
+ {
3523
+ "epoch": 5.894236482471777,
3524
+ "grad_norm": 15.221187591552734,
3525
+ "learning_rate": 2.046654929577465e-06,
3526
+ "loss": 1.1183,
3527
+ "step": 9920
3528
+ },
3529
+ {
3530
+ "epoch": 5.906120023767082,
3531
+ "grad_norm": 6.444254398345947,
3532
+ "learning_rate": 1.8265845070422536e-06,
3533
+ "loss": 1.6049,
3534
+ "step": 9940
3535
+ },
3536
+ {
3537
+ "epoch": 5.918003565062389,
3538
+ "grad_norm": 5.689063549041748,
3539
+ "learning_rate": 1.6065140845070422e-06,
3540
+ "loss": 1.0063,
3541
+ "step": 9960
3542
+ },
3543
+ {
3544
+ "epoch": 5.929887106357695,
3545
+ "grad_norm": 7.111664295196533,
3546
+ "learning_rate": 1.386443661971831e-06,
3547
+ "loss": 0.9414,
3548
+ "step": 9980
3549
+ },
3550
+ {
3551
+ "epoch": 5.941770647653001,
3552
+ "grad_norm": 37.21208953857422,
3553
+ "learning_rate": 1.1663732394366198e-06,
3554
+ "loss": 1.4099,
3555
+ "step": 10000
3556
+ },
3557
+ {
3558
+ "epoch": 5.953654188948306,
3559
+ "grad_norm": 3.9379427433013916,
3560
+ "learning_rate": 9.463028169014084e-07,
3561
+ "loss": 1.1238,
3562
+ "step": 10020
3563
+ },
3564
+ {
3565
+ "epoch": 5.965537730243613,
3566
+ "grad_norm": 2.1166274547576904,
3567
+ "learning_rate": 7.262323943661972e-07,
3568
+ "loss": 1.2788,
3569
+ "step": 10040
3570
+ },
3571
+ {
3572
+ "epoch": 5.977421271538919,
3573
+ "grad_norm": 7.178572654724121,
3574
+ "learning_rate": 5.061619718309859e-07,
3575
+ "loss": 1.0986,
3576
+ "step": 10060
3577
+ },
3578
+ {
3579
+ "epoch": 5.989304812834225,
3580
+ "grad_norm": 0.30666860938072205,
3581
+ "learning_rate": 2.860915492957747e-07,
3582
+ "loss": 1.3072,
3583
+ "step": 10080
3584
+ },
3585
+ {
3586
+ "epoch": 6.0,
3587
+ "eval_accuracy": 0.4074074074074074,
3588
+ "eval_loss": 2.2214395999908447,
3589
+ "eval_runtime": 105.6981,
3590
+ "eval_samples_per_second": 2.81,
3591
+ "eval_steps_per_second": 2.81,
3592
+ "step": 10098
3593
  }
3594
  ],
3595
  "logging_steps": 20,
3596
+ "max_steps": 10098,
3597
  "num_input_tokens_seen": 0,
3598
+ "num_train_epochs": 6,
3599
  "save_steps": 500,
3600
  "stateful_callbacks": {
3601
  "TrainerControl": {
 
3609
  "attributes": {}
3610
  }
3611
  },
3612
+ "total_flos": 2.5433536104702945e+18,
3613
  "train_batch_size": 1,
3614
  "trial_name": null,
3615
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c0e752b8f37590774467b277e14b9b30c1538a66a00e4b537633e9d5515a618
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a06cc0c81e74876caeafbe898f65be97fa973b1744e977206810876d496bb4da
3
  size 5368