Azrail commited on
Commit
4607035
·
verified ·
1 Parent(s): e084b2b

Training in progress, step 157000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd4d2dd6c49affa11f1897287c3a92d51ded625ef3905a93fe2f4fccca4d24fe
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6fe11c67454b196001e4ed79995fab7a74672309c3dd2962d5f80ffb63bc57b
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:52d890c69e743122f34fcfde97a19755e7b50f887489442d3dde7487815a67bf
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b377b9ca632415e5cc259fc332790fb52abc8eed12ac83f3e71264ccc731be8f
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1b8bd83e18b6a0a16b992311d9cb4c920119703c7f78eac118775c828860191
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7eb45c212b2ad29aa591c00be2416908d2afe25e585ed34f3beb9e136a92cef
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7335f317de8b0f5ab070f94b9642f4f3cbf0cbf47942933d3d4f3ba522b768a7
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50e7a91a2d6de7899f3ef55596029edda575e3992fc80bcaed05a5c6cf935dee
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 3.426756581065395,
6
  "eval_steps": 500,
7
- "global_step": 156000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -27776,11 +27776,189 @@
27776
  "eval_steps_per_second": 15.575,
27777
  "num_input_tokens_seen": 90165273280,
27778
  "step": 156000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27779
  }
27780
  ],
27781
  "logging_steps": 50,
27782
  "max_steps": 200000,
27783
- "num_input_tokens_seen": 90165273280,
27784
  "num_train_epochs": 5,
27785
  "save_steps": 1000,
27786
  "stateful_callbacks": {
@@ -27795,7 +27973,7 @@
27795
  "attributes": {}
27796
  }
27797
  },
27798
- "total_flos": 1.5957608755975373e+20,
27799
  "train_batch_size": 32,
27800
  "trial_name": null,
27801
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.4487226393866526,
6
  "eval_steps": 500,
7
+ "global_step": 157000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
27776
  "eval_steps_per_second": 15.575,
27777
  "num_input_tokens_seen": 90165273280,
27778
  "step": 156000
27779
+ },
27780
+ {
27781
+ "epoch": 3.4278548839814578,
27782
+ "grad_norm": 0.09176173806190491,
27783
+ "learning_rate": 0.0001,
27784
+ "loss": 2.3456,
27785
+ "num_input_tokens_seen": 90217696288,
27786
+ "step": 156050
27787
+ },
27788
+ {
27789
+ "epoch": 3.428953186897521,
27790
+ "grad_norm": 0.089874766767025,
27791
+ "learning_rate": 0.0001,
27792
+ "loss": 2.3475,
27793
+ "num_input_tokens_seen": 90270125088,
27794
+ "step": 156100
27795
+ },
27796
+ {
27797
+ "epoch": 3.4300514898135837,
27798
+ "grad_norm": 0.0928313210606575,
27799
+ "learning_rate": 0.0001,
27800
+ "loss": 2.3515,
27801
+ "num_input_tokens_seen": 90322553888,
27802
+ "step": 156150
27803
+ },
27804
+ {
27805
+ "epoch": 3.4311497927296464,
27806
+ "grad_norm": 0.09114927798509598,
27807
+ "learning_rate": 0.0001,
27808
+ "loss": 2.3494,
27809
+ "num_input_tokens_seen": 90374980192,
27810
+ "step": 156200
27811
+ },
27812
+ {
27813
+ "epoch": 3.4322480956457095,
27814
+ "grad_norm": 0.09503049403429031,
27815
+ "learning_rate": 0.0001,
27816
+ "loss": 2.3439,
27817
+ "num_input_tokens_seen": 90427408992,
27818
+ "step": 156250
27819
+ },
27820
+ {
27821
+ "epoch": 3.4333463985617723,
27822
+ "grad_norm": 0.09301070868968964,
27823
+ "learning_rate": 0.0001,
27824
+ "loss": 2.3487,
27825
+ "num_input_tokens_seen": 90479837120,
27826
+ "step": 156300
27827
+ },
27828
+ {
27829
+ "epoch": 3.434444701477835,
27830
+ "grad_norm": 0.09190023690462112,
27831
+ "learning_rate": 0.0001,
27832
+ "loss": 2.3448,
27833
+ "num_input_tokens_seen": 90532258144,
27834
+ "step": 156350
27835
+ },
27836
+ {
27837
+ "epoch": 3.435543004393898,
27838
+ "grad_norm": 0.09636224061250687,
27839
+ "learning_rate": 0.0001,
27840
+ "loss": 2.3456,
27841
+ "num_input_tokens_seen": 90584686944,
27842
+ "step": 156400
27843
+ },
27844
+ {
27845
+ "epoch": 3.436641307309961,
27846
+ "grad_norm": 0.09875821322202682,
27847
+ "learning_rate": 0.0001,
27848
+ "loss": 2.3415,
27849
+ "num_input_tokens_seen": 90637115744,
27850
+ "step": 156450
27851
+ },
27852
+ {
27853
+ "epoch": 3.437739610226024,
27854
+ "grad_norm": 0.09387224912643433,
27855
+ "learning_rate": 0.0001,
27856
+ "loss": 2.3483,
27857
+ "num_input_tokens_seen": 90689540064,
27858
+ "step": 156500
27859
+ },
27860
+ {
27861
+ "epoch": 3.437739610226024,
27862
+ "eval_loss": 2.2581753730773926,
27863
+ "eval_runtime": 79.6955,
27864
+ "eval_samples_per_second": 62.739,
27865
+ "eval_steps_per_second": 15.685,
27866
+ "num_input_tokens_seen": 90689540064,
27867
+ "step": 156500
27868
+ },
27869
+ {
27870
+ "epoch": 3.4388379131420868,
27871
+ "grad_norm": 0.08944698423147202,
27872
+ "learning_rate": 0.0001,
27873
+ "loss": 2.3459,
27874
+ "num_input_tokens_seen": 90741968864,
27875
+ "step": 156550
27876
+ },
27877
+ {
27878
+ "epoch": 3.4399362160581495,
27879
+ "grad_norm": 0.09725566953420639,
27880
+ "learning_rate": 0.0001,
27881
+ "loss": 2.3399,
27882
+ "num_input_tokens_seen": 90794397664,
27883
+ "step": 156600
27884
+ },
27885
+ {
27886
+ "epoch": 3.4410345189742126,
27887
+ "grad_norm": 0.09932785481214523,
27888
+ "learning_rate": 0.0001,
27889
+ "loss": 2.3475,
27890
+ "num_input_tokens_seen": 90846826464,
27891
+ "step": 156650
27892
+ },
27893
+ {
27894
+ "epoch": 3.4421328218902754,
27895
+ "grad_norm": 0.09854361414909363,
27896
+ "learning_rate": 0.0001,
27897
+ "loss": 2.3449,
27898
+ "num_input_tokens_seen": 90899255264,
27899
+ "step": 156700
27900
+ },
27901
+ {
27902
+ "epoch": 3.4432311248063385,
27903
+ "grad_norm": 0.09402545541524887,
27904
+ "learning_rate": 0.0001,
27905
+ "loss": 2.3434,
27906
+ "num_input_tokens_seen": 90951684064,
27907
+ "step": 156750
27908
+ },
27909
+ {
27910
+ "epoch": 3.4443294277224012,
27911
+ "grad_norm": 0.09715921431779861,
27912
+ "learning_rate": 0.0001,
27913
+ "loss": 2.3455,
27914
+ "num_input_tokens_seen": 91004112864,
27915
+ "step": 156800
27916
+ },
27917
+ {
27918
+ "epoch": 3.445427730638464,
27919
+ "grad_norm": 0.09590257704257965,
27920
+ "learning_rate": 0.0001,
27921
+ "loss": 2.348,
27922
+ "num_input_tokens_seen": 91056541664,
27923
+ "step": 156850
27924
+ },
27925
+ {
27926
+ "epoch": 3.446526033554527,
27927
+ "grad_norm": 0.10155434161424637,
27928
+ "learning_rate": 0.0001,
27929
+ "loss": 2.3403,
27930
+ "num_input_tokens_seen": 91108970464,
27931
+ "step": 156900
27932
+ },
27933
+ {
27934
+ "epoch": 3.44762433647059,
27935
+ "grad_norm": 0.09132086485624313,
27936
+ "learning_rate": 0.0001,
27937
+ "loss": 2.3569,
27938
+ "num_input_tokens_seen": 91161399264,
27939
+ "step": 156950
27940
+ },
27941
+ {
27942
+ "epoch": 3.4487226393866526,
27943
+ "grad_norm": 0.0917491465806961,
27944
+ "learning_rate": 0.0001,
27945
+ "loss": 2.3454,
27946
+ "num_input_tokens_seen": 91213822304,
27947
+ "step": 157000
27948
+ },
27949
+ {
27950
+ "epoch": 3.4487226393866526,
27951
+ "eval_loss": 2.2578930854797363,
27952
+ "eval_runtime": 80.5074,
27953
+ "eval_samples_per_second": 62.106,
27954
+ "eval_steps_per_second": 15.527,
27955
+ "num_input_tokens_seen": 91213822304,
27956
+ "step": 157000
27957
  }
27958
  ],
27959
  "logging_steps": 50,
27960
  "max_steps": 200000,
27961
+ "num_input_tokens_seen": 91213822304,
27962
  "num_train_epochs": 5,
27963
  "save_steps": 1000,
27964
  "stateful_callbacks": {
 
27973
  "attributes": {}
27974
  }
27975
  },
27976
+ "total_flos": 1.614318280768918e+20,
27977
  "train_batch_size": 32,
27978
  "trial_name": null,
27979
  "trial_params": null