Azrail commited on
Commit
8b41612
·
verified ·
1 Parent(s): 16432f6

Training in progress, step 129000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d8c3d49b98bfff4ce201de8fd57e1cb46f198541be8429619ddab0ad9d2161b3
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:857ead76dd55a0ff132114f3566b2633c2c5cdde85ae73d0787d641584b91007
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:29a7f0ec5937d8a36844081698ed35de214589f8d2c33900c6101538c2a4386f
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d51187329bb716afa734f026372750945e338e23b7c661997a4d4207a6fd698
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:065d5f05cf0a782fa5b97e409b16ef2b4cf8c6102c4a9437ad899a13c927398f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01c311980c8b0da96dd9e638e23b1e84aa50fb6a11433bc22a347279b706965b
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a9d00381a4191263086f00c86313941ab13504158fdcedfcfacd5f658d7b3729
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60a157573f4024c9cf3f191281f1d04ef870f25b0126e228157b25abffaa2ebf
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.2211288264545597,
6
  "eval_steps": 500,
7
- "global_step": 128000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -22792,11 +22792,189 @@
22792
  "eval_steps_per_second": 14.933,
22793
  "num_input_tokens_seen": 67098059328,
22794
  "step": 128000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22795
  }
22796
  ],
22797
  "logging_steps": 50,
22798
  "max_steps": 140000,
22799
- "num_input_tokens_seen": 67098059328,
22800
  "num_train_epochs": 2,
22801
  "save_steps": 1000,
22802
  "stateful_callbacks": {
@@ -22811,7 +22989,7 @@
22811
  "attributes": {}
22812
  }
22813
  },
22814
- "total_flos": 1.1875132632453857e+20,
22815
  "train_batch_size": 32,
22816
  "trial_name": null,
22817
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.230668876778325,
6
  "eval_steps": 500,
7
+ "global_step": 129000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
22792
  "eval_steps_per_second": 14.933,
22793
  "num_input_tokens_seen": 67098059328,
22794
  "step": 128000
22795
+ },
22796
+ {
22797
+ "epoch": 1.2216058289707479,
22798
+ "grad_norm": 0.13075117766857147,
22799
+ "learning_rate": 0.00038600663175469667,
22800
+ "loss": 2.0582,
22801
+ "num_input_tokens_seen": 67124264448,
22802
+ "step": 128050
22803
+ },
22804
+ {
22805
+ "epoch": 1.222082831486936,
22806
+ "grad_norm": 0.1297282576560974,
22807
+ "learning_rate": 0.00038327731807204744,
22808
+ "loss": 2.0595,
22809
+ "num_input_tokens_seen": 67150472320,
22810
+ "step": 128100
22811
+ },
22812
+ {
22813
+ "epoch": 1.2225598340031243,
22814
+ "grad_norm": 0.12640318274497986,
22815
+ "learning_rate": 0.00038055167787050134,
22816
+ "loss": 2.0525,
22817
+ "num_input_tokens_seen": 67176672192,
22818
+ "step": 128150
22819
+ },
22820
+ {
22821
+ "epoch": 1.2230368365193127,
22822
+ "grad_norm": 0.1315733790397644,
22823
+ "learning_rate": 0.00037782979693105293,
22824
+ "loss": 2.0499,
22825
+ "num_input_tokens_seen": 67202877408,
22826
+ "step": 128200
22827
+ },
22828
+ {
22829
+ "epoch": 1.223513839035501,
22830
+ "grad_norm": 0.12865200638771057,
22831
+ "learning_rate": 0.0003751117609163865,
22832
+ "loss": 2.051,
22833
+ "num_input_tokens_seen": 67229091168,
22834
+ "step": 128250
22835
+ },
22836
+ {
22837
+ "epoch": 1.2239908415516891,
22838
+ "grad_norm": 0.1271800547838211,
22839
+ "learning_rate": 0.00037239765536817873,
22840
+ "loss": 2.0555,
22841
+ "num_input_tokens_seen": 67255304768,
22842
+ "step": 128300
22843
+ },
22844
+ {
22845
+ "epoch": 1.2244678440678776,
22846
+ "grad_norm": 0.13572408258914948,
22847
+ "learning_rate": 0.0003696875657044073,
22848
+ "loss": 2.0622,
22849
+ "num_input_tokens_seen": 67281509184,
22850
+ "step": 128350
22851
+ },
22852
+ {
22853
+ "epoch": 1.2249448465840658,
22854
+ "grad_norm": 0.12558363378047943,
22855
+ "learning_rate": 0.0003669815772166625,
22856
+ "loss": 2.0548,
22857
+ "num_input_tokens_seen": 67307717088,
22858
+ "step": 128400
22859
+ },
22860
+ {
22861
+ "epoch": 1.225421849100254,
22862
+ "grad_norm": 0.13062912225723267,
22863
+ "learning_rate": 0.0003642797750674629,
22864
+ "loss": 2.0473,
22865
+ "num_input_tokens_seen": 67333928800,
22866
+ "step": 128450
22867
+ },
22868
+ {
22869
+ "epoch": 1.2258988516164422,
22870
+ "grad_norm": 0.1351100355386734,
22871
+ "learning_rate": 0.00036158224428757535,
22872
+ "loss": 2.0475,
22873
+ "num_input_tokens_seen": 67360131616,
22874
+ "step": 128500
22875
+ },
22876
+ {
22877
+ "epoch": 1.2258988516164422,
22878
+ "eval_loss": 1.9701597690582275,
22879
+ "eval_runtime": 82.4081,
22880
+ "eval_samples_per_second": 60.674,
22881
+ "eval_steps_per_second": 15.168,
22882
+ "num_input_tokens_seen": 67360131616,
22883
+ "step": 128500
22884
+ },
22885
+ {
22886
+ "epoch": 1.2263758541326306,
22887
+ "grad_norm": 0.13211333751678467,
22888
+ "learning_rate": 0.00035888906977333857,
22889
+ "loss": 2.0622,
22890
+ "num_input_tokens_seen": 67386344736,
22891
+ "step": 128550
22892
+ },
22893
+ {
22894
+ "epoch": 1.2268528566488188,
22895
+ "grad_norm": 0.12648384273052216,
22896
+ "learning_rate": 0.0003562003362839914,
22897
+ "loss": 2.051,
22898
+ "num_input_tokens_seen": 67412555520,
22899
+ "step": 128600
22900
+ },
22901
+ {
22902
+ "epoch": 1.227329859165007,
22903
+ "grad_norm": 0.13109999895095825,
22904
+ "learning_rate": 0.00035351612843900553,
22905
+ "loss": 2.0529,
22906
+ "num_input_tokens_seen": 67438769504,
22907
+ "step": 128650
22908
+ },
22909
+ {
22910
+ "epoch": 1.2278068616811955,
22911
+ "grad_norm": 0.12981992959976196,
22912
+ "learning_rate": 0.000350836530715422,
22913
+ "loss": 2.045,
22914
+ "num_input_tokens_seen": 67464972864,
22915
+ "step": 128700
22916
+ },
22917
+ {
22918
+ "epoch": 1.2282838641973837,
22919
+ "grad_norm": 0.1246839389204979,
22920
+ "learning_rate": 0.00034816162744519263,
22921
+ "loss": 2.0569,
22922
+ "num_input_tokens_seen": 67491186176,
22923
+ "step": 128750
22924
+ },
22925
+ {
22926
+ "epoch": 1.2287608667135719,
22927
+ "grad_norm": 0.13077682256698608,
22928
+ "learning_rate": 0.00034549150281252633,
22929
+ "loss": 2.0461,
22930
+ "num_input_tokens_seen": 67517399168,
22931
+ "step": 128800
22932
+ },
22933
+ {
22934
+ "epoch": 1.22923786922976,
22935
+ "grad_norm": 0.12939219176769257,
22936
+ "learning_rate": 0.000342826240851239,
22937
+ "loss": 2.047,
22938
+ "num_input_tokens_seen": 67543606592,
22939
+ "step": 128850
22940
+ },
22941
+ {
22942
+ "epoch": 1.2297148717459485,
22943
+ "grad_norm": 0.12711487710475922,
22944
+ "learning_rate": 0.00034016592544210936,
22945
+ "loss": 2.0411,
22946
+ "num_input_tokens_seen": 67569807488,
22947
+ "step": 128900
22948
+ },
22949
+ {
22950
+ "epoch": 1.2301918742621367,
22951
+ "grad_norm": 0.13154172897338867,
22952
+ "learning_rate": 0.00033751064031023887,
22953
+ "loss": 2.0536,
22954
+ "num_input_tokens_seen": 67596020896,
22955
+ "step": 128950
22956
+ },
22957
+ {
22958
+ "epoch": 1.230668876778325,
22959
+ "grad_norm": 0.1312495321035385,
22960
+ "learning_rate": 0.00033486046902241664,
22961
+ "loss": 2.0558,
22962
+ "num_input_tokens_seen": 67622231264,
22963
+ "step": 129000
22964
+ },
22965
+ {
22966
+ "epoch": 1.230668876778325,
22967
+ "eval_loss": 1.9686726331710815,
22968
+ "eval_runtime": 82.3322,
22969
+ "eval_samples_per_second": 60.73,
22970
+ "eval_steps_per_second": 15.182,
22971
+ "num_input_tokens_seen": 67622231264,
22972
+ "step": 129000
22973
  }
22974
  ],
22975
  "logging_steps": 50,
22976
  "max_steps": 140000,
22977
+ "num_input_tokens_seen": 67622231264,
22978
  "num_train_epochs": 2,
22979
  "save_steps": 1000,
22980
  "stateful_callbacks": {
 
22989
  "attributes": {}
22990
  }
22991
  },
22992
+ "total_flos": 1.1967901504229745e+20,
22993
  "train_batch_size": 32,
22994
  "trial_name": null,
22995
  "trial_params": null