Azrail commited on
Commit
c0493db
·
verified ·
1 Parent(s): c49d237

Training in progress, step 50000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d2189cc3a98b2403601c139a530c8e21835cb6237e1f4942ace6213b73fce5f
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c9956ceaa01a8262c17e82fea9ac349503f1643baa686fe83baf73d6c182cfd
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec0537d11321458817927ebac3e783711d3aa86865e3823d0bc93d1e41dfc5d1
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9a8f8b0ff9c7ab62e432b714de9517f6859e2ebcb731ff15954b08eab3fa5fd
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02dd1579b4e4c484590ab9c87fcdb4df0578497bcab5d7d028a086e5a9506abe
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08d6a67f7616cccd33f77a5e076df0611e7b35eb8ba28bbeb4122e81eca5afa0
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:85feca9ffa4367dad07b4142308894db505807fd169bd3aedff12898c8f097e0
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66168c288d1955c1c664cfa64be79d9023fb79ca5529a1e6b201d572885b2dfe
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.23373123293225373,
6
  "eval_steps": 500,
7
- "global_step": 49000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8730,11 +8730,189 @@
8730
  "eval_steps_per_second": 23.197,
8731
  "num_input_tokens_seen": 12845051456,
8732
  "step": 49000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8733
  }
8734
  ],
8735
  "logging_steps": 50,
8736
  "max_steps": 70000,
8737
- "num_input_tokens_seen": 12845051456,
8738
  "num_train_epochs": 1,
8739
  "save_steps": 1000,
8740
  "stateful_callbacks": {
@@ -8749,7 +8927,7 @@
8749
  "attributes": {}
8750
  }
8751
  },
8752
- "total_flos": 3.4361766321822106e+18,
8753
  "train_batch_size": 64,
8754
  "trial_name": null,
8755
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.23850125809413644,
6
  "eval_steps": 500,
7
+ "global_step": 50000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8730
  "eval_steps_per_second": 23.197,
8731
  "num_input_tokens_seen": 12845051456,
8732
  "step": 49000
8733
+ },
8734
+ {
8735
+ "epoch": 0.23396973419034786,
8736
+ "grad_norm": 0.209337517619133,
8737
+ "learning_rate": 0.001,
8738
+ "loss": 2.6348,
8739
+ "num_input_tokens_seen": 12858158656,
8740
+ "step": 49050
8741
+ },
8742
+ {
8743
+ "epoch": 0.234208235448442,
8744
+ "grad_norm": 0.1974038928747177,
8745
+ "learning_rate": 0.001,
8746
+ "loss": 2.6158,
8747
+ "num_input_tokens_seen": 12871265856,
8748
+ "step": 49100
8749
+ },
8750
+ {
8751
+ "epoch": 0.23444673670653612,
8752
+ "grad_norm": 0.28099164366722107,
8753
+ "learning_rate": 0.001,
8754
+ "loss": 2.6101,
8755
+ "num_input_tokens_seen": 12884373056,
8756
+ "step": 49150
8757
+ },
8758
+ {
8759
+ "epoch": 0.23468523796463026,
8760
+ "grad_norm": 0.2172873318195343,
8761
+ "learning_rate": 0.001,
8762
+ "loss": 2.596,
8763
+ "num_input_tokens_seen": 12897480256,
8764
+ "step": 49200
8765
+ },
8766
+ {
8767
+ "epoch": 0.2349237392227244,
8768
+ "grad_norm": 0.2120896875858307,
8769
+ "learning_rate": 0.001,
8770
+ "loss": 2.5994,
8771
+ "num_input_tokens_seen": 12910587456,
8772
+ "step": 49250
8773
+ },
8774
+ {
8775
+ "epoch": 0.23516224048081855,
8776
+ "grad_norm": 0.20109935104846954,
8777
+ "learning_rate": 0.001,
8778
+ "loss": 2.6101,
8779
+ "num_input_tokens_seen": 12923694656,
8780
+ "step": 49300
8781
+ },
8782
+ {
8783
+ "epoch": 0.23540074173891268,
8784
+ "grad_norm": 0.20735585689544678,
8785
+ "learning_rate": 0.001,
8786
+ "loss": 2.6142,
8787
+ "num_input_tokens_seen": 12936801856,
8788
+ "step": 49350
8789
+ },
8790
+ {
8791
+ "epoch": 0.2356392429970068,
8792
+ "grad_norm": 0.21295137703418732,
8793
+ "learning_rate": 0.001,
8794
+ "loss": 2.6226,
8795
+ "num_input_tokens_seen": 12949909056,
8796
+ "step": 49400
8797
+ },
8798
+ {
8799
+ "epoch": 0.23587774425510094,
8800
+ "grad_norm": 0.20560845732688904,
8801
+ "learning_rate": 0.001,
8802
+ "loss": 2.6027,
8803
+ "num_input_tokens_seen": 12963016256,
8804
+ "step": 49450
8805
+ },
8806
+ {
8807
+ "epoch": 0.23611624551319507,
8808
+ "grad_norm": 0.33747321367263794,
8809
+ "learning_rate": 0.001,
8810
+ "loss": 2.6231,
8811
+ "num_input_tokens_seen": 12976123456,
8812
+ "step": 49500
8813
+ },
8814
+ {
8815
+ "epoch": 0.23611624551319507,
8816
+ "eval_loss": 2.5008058547973633,
8817
+ "eval_runtime": 54.2104,
8818
+ "eval_samples_per_second": 92.233,
8819
+ "eval_steps_per_second": 23.058,
8820
+ "num_input_tokens_seen": 12976123456,
8821
+ "step": 49500
8822
+ },
8823
+ {
8824
+ "epoch": 0.23635474677128923,
8825
+ "grad_norm": 0.24593485891819,
8826
+ "learning_rate": 0.001,
8827
+ "loss": 2.6336,
8828
+ "num_input_tokens_seen": 12989230656,
8829
+ "step": 49550
8830
+ },
8831
+ {
8832
+ "epoch": 0.23659324802938336,
8833
+ "grad_norm": 0.25253933668136597,
8834
+ "learning_rate": 0.001,
8835
+ "loss": 2.643,
8836
+ "num_input_tokens_seen": 13002337856,
8837
+ "step": 49600
8838
+ },
8839
+ {
8840
+ "epoch": 0.2368317492874775,
8841
+ "grad_norm": 0.24231670796871185,
8842
+ "learning_rate": 0.001,
8843
+ "loss": 2.6074,
8844
+ "num_input_tokens_seen": 13015445056,
8845
+ "step": 49650
8846
+ },
8847
+ {
8848
+ "epoch": 0.23707025054557163,
8849
+ "grad_norm": 0.2178962677717209,
8850
+ "learning_rate": 0.001,
8851
+ "loss": 2.6184,
8852
+ "num_input_tokens_seen": 13028552256,
8853
+ "step": 49700
8854
+ },
8855
+ {
8856
+ "epoch": 0.23730875180366576,
8857
+ "grad_norm": 0.2651260793209076,
8858
+ "learning_rate": 0.001,
8859
+ "loss": 2.6335,
8860
+ "num_input_tokens_seen": 13041659456,
8861
+ "step": 49750
8862
+ },
8863
+ {
8864
+ "epoch": 0.2375472530617599,
8865
+ "grad_norm": 0.1909639537334442,
8866
+ "learning_rate": 0.001,
8867
+ "loss": 2.61,
8868
+ "num_input_tokens_seen": 13054766656,
8869
+ "step": 49800
8870
+ },
8871
+ {
8872
+ "epoch": 0.23778575431985405,
8873
+ "grad_norm": 0.21107855439186096,
8874
+ "learning_rate": 0.001,
8875
+ "loss": 2.6333,
8876
+ "num_input_tokens_seen": 13067873856,
8877
+ "step": 49850
8878
+ },
8879
+ {
8880
+ "epoch": 0.23802425557794818,
8881
+ "grad_norm": 0.19366736710071564,
8882
+ "learning_rate": 0.001,
8883
+ "loss": 2.6068,
8884
+ "num_input_tokens_seen": 13080981056,
8885
+ "step": 49900
8886
+ },
8887
+ {
8888
+ "epoch": 0.2382627568360423,
8889
+ "grad_norm": 0.2851523458957672,
8890
+ "learning_rate": 0.001,
8891
+ "loss": 2.6183,
8892
+ "num_input_tokens_seen": 13094088256,
8893
+ "step": 49950
8894
+ },
8895
+ {
8896
+ "epoch": 0.23850125809413644,
8897
+ "grad_norm": 0.23617912828922272,
8898
+ "learning_rate": 0.001,
8899
+ "loss": 2.617,
8900
+ "num_input_tokens_seen": 13107195456,
8901
+ "step": 50000
8902
+ },
8903
+ {
8904
+ "epoch": 0.23850125809413644,
8905
+ "eval_loss": 2.497406005859375,
8906
+ "eval_runtime": 53.6538,
8907
+ "eval_samples_per_second": 93.19,
8908
+ "eval_steps_per_second": 23.298,
8909
+ "num_input_tokens_seen": 13107195456,
8910
+ "step": 50000
8911
  }
8912
  ],
8913
  "logging_steps": 50,
8914
  "max_steps": 70000,
8915
+ "num_input_tokens_seen": 13107195456,
8916
  "num_train_epochs": 1,
8917
  "save_steps": 1000,
8918
  "stateful_callbacks": {
 
8927
  "attributes": {}
8928
  }
8929
  },
8930
+ "total_flos": 3.5063027107076506e+18,
8931
  "train_batch_size": 64,
8932
  "trial_name": null,
8933
  "trial_params": null