TweedleDeepLearnings committed · verified
Commit d2b69a2 · 1 Parent(s): d71c81c

Training in progress, step 2700, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5c88db1333f5ed8a9c0ff0459f14879ebf7617fb770c07db8372de8d4743d28e
+ oid sha256:c393ddf30ac6460737104f4d2745ea8242503edafcb67bec75c33d2824d1f5d7
  size 676264504
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e66a49968d29fdbbfe4edd3b2b7407ceac7b90ecf6cfddb3d526afcdd2d8b98b
+ oid sha256:e98b29c362e755d979244b4e208a8a66c23be4d6ed7b48312a10de0123f98f74
  size 1274083770
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3dd6222f0c699c537056f765320e14a81115abae3ade22fd8035e8e200f47007
+ oid sha256:63c0140fc143ef9af84d50f32dfbe3d22386c0a90be84aa48736592e7040518e
  size 14180
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:993cde09f9ab221e14600f36e616bed6bfd66828a4b3ca55574b06efb8b5baa9
+ oid sha256:19994e30e16381567f2412578e95061bd82c7d292508d95caf108c1a628ea6fc
  size 1256
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": 0.0004596015496645123,
  "best_model_checkpoint": "./output/checkpoint-1800",
- "epoch": 2.954808806488992,
+ "epoch": 3.1286210892236386,
  "eval_steps": 150,
- "global_step": 2550,
+ "global_step": 2700,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -1928,6 +1928,119 @@
  "eval_samples_per_second": 11.144,
  "eval_steps_per_second": 11.144,
  "step": 2550
+ },
+ {
+ "epoch": 2.966396292004635,
+ "grad_norm": 0.0019562735687941313,
+ "learning_rate": 2.1858949867611754e-05,
+ "loss": 0.0035,
+ "step": 2560
+ },
+ {
+ "epoch": 2.977983777520278,
+ "grad_norm": 0.08442725241184235,
+ "learning_rate": 2.1717905533241997e-05,
+ "loss": 0.0001,
+ "step": 2570
+ },
+ {
+ "epoch": 2.9895712630359212,
+ "grad_norm": 0.012547359801828861,
+ "learning_rate": 2.157687279467088e-05,
+ "loss": 0.0071,
+ "step": 2580
+ },
+ {
+ "epoch": 3.0011587485515645,
+ "grad_norm": 0.0036566180642694235,
+ "learning_rate": 2.14358574492019e-05,
+ "loss": 0.0001,
+ "step": 2590
+ },
+ {
+ "epoch": 3.0127462340672073,
+ "grad_norm": 0.003990466240793467,
+ "learning_rate": 2.1294865293423586e-05,
+ "loss": 0.0001,
+ "step": 2600
+ },
+ {
+ "epoch": 3.0243337195828506,
+ "grad_norm": 0.006470364052802324,
+ "learning_rate": 2.1153902122971233e-05,
+ "loss": 0.0001,
+ "step": 2610
+ },
+ {
+ "epoch": 3.035921205098494,
+ "grad_norm": 17.235862731933594,
+ "learning_rate": 2.101297373228868e-05,
+ "loss": 0.0062,
+ "step": 2620
+ },
+ {
+ "epoch": 3.0475086906141367,
+ "grad_norm": 0.003931706305593252,
+ "learning_rate": 2.087208591439006e-05,
+ "loss": 0.0001,
+ "step": 2630
+ },
+ {
+ "epoch": 3.05909617612978,
+ "grad_norm": 0.016153201460838318,
+ "learning_rate": 2.0731244460621764e-05,
+ "loss": 0.0001,
+ "step": 2640
+ },
+ {
+ "epoch": 3.0706836616454227,
+ "grad_norm": 0.02430218830704689,
+ "learning_rate": 2.0590455160424316e-05,
+ "loss": 0.0015,
+ "step": 2650
+ },
+ {
+ "epoch": 3.082271147161066,
+ "grad_norm": 0.01603887602686882,
+ "learning_rate": 2.044972380109441e-05,
+ "loss": 0.0004,
+ "step": 2660
+ },
+ {
+ "epoch": 3.0938586326767092,
+ "grad_norm": 0.0475073866546154,
+ "learning_rate": 2.030905616754704e-05,
+ "loss": 0.0001,
+ "step": 2670
+ },
+ {
+ "epoch": 3.105446118192352,
+ "grad_norm": 0.004527157172560692,
+ "learning_rate": 2.0168458042077636e-05,
+ "loss": 0.0001,
+ "step": 2680
+ },
+ {
+ "epoch": 3.1170336037079953,
+ "grad_norm": 0.003917761612683535,
+ "learning_rate": 2.0027935204124465e-05,
+ "loss": 0.0001,
+ "step": 2690
+ },
+ {
+ "epoch": 3.1286210892236386,
+ "grad_norm": 0.0348745621740818,
+ "learning_rate": 1.9887493430031e-05,
+ "loss": 0.0001,
+ "step": 2700
+ },
+ {
+ "epoch": 3.1286210892236386,
+ "eval_loss": 0.0028791166841983795,
+ "eval_runtime": 45.6443,
+ "eval_samples_per_second": 10.954,
+ "eval_steps_per_second": 10.954,
+ "step": 2700
  }
  ],
  "logging_steps": 10,
@@ -1947,7 +2060,7 @@
  "attributes": {}
  }
  },
- "total_flos": 2.0534904848584704e+17,
+ "total_flos": 2.1743560581808128e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null