FredericFan committed on
Commit
6b34e34
·
verified ·
1 Parent(s): 28f848a

Training in progress, step 12500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6989d08304817f7eda2a41817fe6fdf1039613dcc5ce63bc28b05cebb5b0a729
3
  size 891558696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a82551d15f6384cb873b544e144c1f4a47713f4d120acfb3fbb93f105eb65230
3
  size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ef2622b356cc24fc03d43081c989231e41a9f98afae1477feeac80752fcbe27
3
  size 1783272762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2430d0f4d73641065e9bf5dd721f5eb31d0400c5f5950d844482b26db026c2da
3
  size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:25c75df0b33157e12f11389fab2a782cb08091622aab636059c475b184fac12c
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8295ae519bb42d65fe3f9c31072a390aa31ad86276d0cec6598329f84d5b468f
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4ba896b8260293f6060945e9d4b6d72d879757324e56407782c7dd941c44937b
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2529f9270ae0263cc98da512a0bbd9cfce3aaadf2f105e2a962446a19cb9d893
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.082674041390419,
3
- "best_model_checkpoint": "./fine-tuned/checkpoint-12000",
4
- "epoch": 0.96,
5
  "eval_steps": 500,
6
- "global_step": 12000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1879,6 +1879,84 @@
1879
  "eval_samples_per_second": 22.714,
1880
  "eval_steps_per_second": 5.679,
1881
  "step": 12000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1882
  }
1883
  ],
1884
  "logging_steps": 50,
@@ -1898,7 +1976,7 @@
1898
  "attributes": {}
1899
  }
1900
  },
1901
- "total_flos": 2.922997874688e+16,
1902
  "train_batch_size": 4,
1903
  "trial_name": null,
1904
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.0824647843837738,
3
+ "best_model_checkpoint": "./fine-tuned/checkpoint-12500",
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 12500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1879
  "eval_samples_per_second": 22.714,
1880
  "eval_steps_per_second": 5.679,
1881
  "step": 12000
1882
+ },
1883
+ {
1884
+ "epoch": 0.964,
1885
+ "grad_norm": 0.1296168565750122,
1886
+ "learning_rate": 1.55424e-05,
1887
+ "loss": 0.0638,
1888
+ "step": 12050
1889
+ },
1890
+ {
1891
+ "epoch": 0.968,
1892
+ "grad_norm": 0.14450989663600922,
1893
+ "learning_rate": 1.54824e-05,
1894
+ "loss": 0.0623,
1895
+ "step": 12100
1896
+ },
1897
+ {
1898
+ "epoch": 0.972,
1899
+ "grad_norm": 0.09367354214191437,
1900
+ "learning_rate": 1.54224e-05,
1901
+ "loss": 0.0618,
1902
+ "step": 12150
1903
+ },
1904
+ {
1905
+ "epoch": 0.976,
1906
+ "grad_norm": 0.13332900404930115,
1907
+ "learning_rate": 1.53624e-05,
1908
+ "loss": 0.0608,
1909
+ "step": 12200
1910
+ },
1911
+ {
1912
+ "epoch": 0.98,
1913
+ "grad_norm": 0.14300012588500977,
1914
+ "learning_rate": 1.53024e-05,
1915
+ "loss": 0.0695,
1916
+ "step": 12250
1917
+ },
1918
+ {
1919
+ "epoch": 0.984,
1920
+ "grad_norm": 0.14829818904399872,
1921
+ "learning_rate": 1.52424e-05,
1922
+ "loss": 0.0589,
1923
+ "step": 12300
1924
+ },
1925
+ {
1926
+ "epoch": 0.988,
1927
+ "grad_norm": 0.08889272063970566,
1928
+ "learning_rate": 1.5182399999999999e-05,
1929
+ "loss": 0.0578,
1930
+ "step": 12350
1931
+ },
1932
+ {
1933
+ "epoch": 0.992,
1934
+ "grad_norm": 0.18994523584842682,
1935
+ "learning_rate": 1.51224e-05,
1936
+ "loss": 0.0667,
1937
+ "step": 12400
1938
+ },
1939
+ {
1940
+ "epoch": 0.996,
1941
+ "grad_norm": 0.16152743995189667,
1942
+ "learning_rate": 1.50624e-05,
1943
+ "loss": 0.0552,
1944
+ "step": 12450
1945
+ },
1946
+ {
1947
+ "epoch": 1.0,
1948
+ "grad_norm": 0.2399597018957138,
1949
+ "learning_rate": 1.5002399999999999e-05,
1950
+ "loss": 0.0632,
1951
+ "step": 12500
1952
+ },
1953
+ {
1954
+ "epoch": 1.0,
1955
+ "eval_loss": 0.0824647843837738,
1956
+ "eval_runtime": 88.0514,
1957
+ "eval_samples_per_second": 22.714,
1958
+ "eval_steps_per_second": 5.678,
1959
+ "step": 12500
1960
  }
1961
  ],
1962
  "logging_steps": 50,
 
1976
  "attributes": {}
1977
  }
1978
  },
1979
+ "total_flos": 3.0447894528e+16,
1980
  "train_batch_size": 4,
1981
  "trial_name": null,
1982
  "trial_params": null