Azrail commited on
Commit
8a137ae
·
verified ·
1 Parent(s): 453508b

Training in progress, step 134000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:92ad31cc8051a774ff84bf50a2f043b12568d60c659ab713450ad489e60ff067
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d56ac5cac24a22412473f2135127ddabb38b319ea83b674e986a42239b250e9
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f98fbf6f84fc645d4e9351e4872ab3409232339169c895981f2ca6168553f54
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4d6a881e9f26105deee08c944e754ddbf4c77f455ab89089e93e0141d4bbc5a
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eda9968c0f9e110957e79edd3603196e5c46bdd8acc1a9a916fa49100e905254
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24b5f8b02f183c01b91dfb927bcee2fd08e29422009a0f8c863f42c2374d464d
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c40f5e3cc10bc35190c452a89f96d672b73ffd5edfe6d4e72f9d0b88f5a7c9a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f96155b98d632c68f19e59b549aa9343e95b0d1b8978f18da42e6a70e5498d0e
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.2688290780733869,
6
  "eval_steps": 500,
7
- "global_step": 133000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -23682,11 +23682,189 @@
23682
  "eval_steps_per_second": 15.141,
23683
  "num_input_tokens_seen": 69719047840,
23684
  "step": 133000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23685
  }
23686
  ],
23687
  "logging_steps": 50,
23688
  "max_steps": 140000,
23689
- "num_input_tokens_seen": 69719047840,
23690
  "num_train_epochs": 2,
23691
  "save_steps": 1000,
23692
  "stateful_callbacks": {
@@ -23701,7 +23879,7 @@
23701
  "attributes": {}
23702
  }
23703
  },
23704
- "total_flos": 1.2338999792247398e+20,
23705
  "train_batch_size": 32,
23706
  "trial_name": null,
23707
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.2783691283971523,
6
  "eval_steps": 500,
7
+ "global_step": 134000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
23682
  "eval_steps_per_second": 15.141,
23683
  "num_input_tokens_seen": 69719047840,
23684
  "step": 133000
23685
+ },
23686
+ {
23687
+ "epoch": 1.269306080589575,
23688
+ "grad_norm": 0.12358897924423218,
23689
+ "learning_rate": 0.00014446875342055988,
23690
+ "loss": 2.0342,
23691
+ "num_input_tokens_seen": 69745262240,
23692
+ "step": 133050
23693
+ },
23694
+ {
23695
+ "epoch": 1.2697830831057635,
23696
+ "grad_norm": 0.12031599134206772,
23697
+ "learning_rate": 0.00014250208666766236,
23698
+ "loss": 2.0402,
23699
+ "num_input_tokens_seen": 69771476640,
23700
+ "step": 133100
23701
+ },
23702
+ {
23703
+ "epoch": 1.2702600856219517,
23704
+ "grad_norm": 0.12011140584945679,
23705
+ "learning_rate": 0.00014054667104271496,
23706
+ "loss": 2.0358,
23707
+ "num_input_tokens_seen": 69797691040,
23708
+ "step": 133150
23709
+ },
23710
+ {
23711
+ "epoch": 1.27073708813814,
23712
+ "grad_norm": 0.12352379411458969,
23713
+ "learning_rate": 0.00013860256808630427,
23714
+ "loss": 2.043,
23715
+ "num_input_tokens_seen": 69823902816,
23716
+ "step": 133200
23717
+ },
23718
+ {
23719
+ "epoch": 1.271214090654328,
23720
+ "grad_norm": 0.1257781833410263,
23721
+ "learning_rate": 0.00013666983898298656,
23722
+ "loss": 2.0464,
23723
+ "num_input_tokens_seen": 69850112224,
23724
+ "step": 133250
23725
+ },
23726
+ {
23727
+ "epoch": 1.2716910931705165,
23728
+ "grad_norm": 0.12694838643074036,
23729
+ "learning_rate": 0.00013474854455936125,
23730
+ "loss": 2.0401,
23731
+ "num_input_tokens_seen": 69876325568,
23732
+ "step": 133300
23733
+ },
23734
+ {
23735
+ "epoch": 1.2721680956867047,
23736
+ "grad_norm": 0.12634819746017456,
23737
+ "learning_rate": 0.00013283874528215734,
23738
+ "loss": 2.0339,
23739
+ "num_input_tokens_seen": 69902536928,
23740
+ "step": 133350
23741
+ },
23742
+ {
23743
+ "epoch": 1.272645098202893,
23744
+ "grad_norm": 0.12307710945606232,
23745
+ "learning_rate": 0.00013094050125632973,
23746
+ "loss": 2.0277,
23747
+ "num_input_tokens_seen": 69928748288,
23748
+ "step": 133400
23749
+ },
23750
+ {
23751
+ "epoch": 1.2731221007190814,
23752
+ "grad_norm": 0.12187953293323517,
23753
+ "learning_rate": 0.00012905387222316822,
23754
+ "loss": 2.0402,
23755
+ "num_input_tokens_seen": 69954953888,
23756
+ "step": 133450
23757
+ },
23758
+ {
23759
+ "epoch": 1.2735991032352696,
23760
+ "grad_norm": 0.12032655626535416,
23761
+ "learning_rate": 0.0001271789175584172,
23762
+ "loss": 2.0419,
23763
+ "num_input_tokens_seen": 69981165632,
23764
+ "step": 133500
23765
+ },
23766
+ {
23767
+ "epoch": 1.2735991032352696,
23768
+ "eval_loss": 1.9568681716918945,
23769
+ "eval_runtime": 82.7406,
23770
+ "eval_samples_per_second": 60.43,
23771
+ "eval_steps_per_second": 15.107,
23772
+ "num_input_tokens_seen": 69981165632,
23773
+ "step": 133500
23774
+ },
23775
+ {
23776
+ "epoch": 1.2740761057514578,
23777
+ "grad_norm": 0.12817110121250153,
23778
+ "learning_rate": 0.00012531569627040635,
23779
+ "loss": 2.034,
23780
+ "num_input_tokens_seen": 70007368800,
23781
+ "step": 133550
23782
+ },
23783
+ {
23784
+ "epoch": 1.274553108267646,
23785
+ "grad_norm": 0.13095012307167053,
23786
+ "learning_rate": 0.00012346426699819457,
23787
+ "loss": 2.0346,
23788
+ "num_input_tokens_seen": 70033578048,
23789
+ "step": 133600
23790
+ },
23791
+ {
23792
+ "epoch": 1.2750301107838344,
23793
+ "grad_norm": 0.12582357227802277,
23794
+ "learning_rate": 0.00012162468800972342,
23795
+ "loss": 2.0398,
23796
+ "num_input_tokens_seen": 70059792448,
23797
+ "step": 133650
23798
+ },
23799
+ {
23800
+ "epoch": 1.2755071133000226,
23801
+ "grad_norm": 0.11612017452716827,
23802
+ "learning_rate": 0.00011979701719998454,
23803
+ "loss": 2.0341,
23804
+ "num_input_tokens_seen": 70086003648,
23805
+ "step": 133700
23806
+ },
23807
+ {
23808
+ "epoch": 1.2759841158162109,
23809
+ "grad_norm": 0.12256049364805222,
23810
+ "learning_rate": 0.00011798131208919626,
23811
+ "loss": 2.029,
23812
+ "num_input_tokens_seen": 70112204096,
23813
+ "step": 133750
23814
+ },
23815
+ {
23816
+ "epoch": 1.2764611183323993,
23817
+ "grad_norm": 0.11747635900974274,
23818
+ "learning_rate": 0.00011617762982099444,
23819
+ "loss": 2.0355,
23820
+ "num_input_tokens_seen": 70138411104,
23821
+ "step": 133800
23822
+ },
23823
+ {
23824
+ "epoch": 1.2769381208485875,
23825
+ "grad_norm": 0.12225272506475449,
23826
+ "learning_rate": 0.00011438602716063329,
23827
+ "loss": 2.042,
23828
+ "num_input_tokens_seen": 70164623328,
23829
+ "step": 133850
23830
+ },
23831
+ {
23832
+ "epoch": 1.2774151233647757,
23833
+ "grad_norm": 0.1293225735425949,
23834
+ "learning_rate": 0.00011260656049319957,
23835
+ "loss": 2.0367,
23836
+ "num_input_tokens_seen": 70190833888,
23837
+ "step": 133900
23838
+ },
23839
+ {
23840
+ "epoch": 1.277892125880964,
23841
+ "grad_norm": 0.12261593341827393,
23842
+ "learning_rate": 0.0001108392858218371,
23843
+ "loss": 2.0444,
23844
+ "num_input_tokens_seen": 70217043648,
23845
+ "step": 133950
23846
+ },
23847
+ {
23848
+ "epoch": 1.2783691283971523,
23849
+ "grad_norm": 0.11957214772701263,
23850
+ "learning_rate": 0.0001090842587659851,
23851
+ "loss": 2.0345,
23852
+ "num_input_tokens_seen": 70243253472,
23853
+ "step": 134000
23854
+ },
23855
+ {
23856
+ "epoch": 1.2783691283971523,
23857
+ "eval_loss": 1.955412745475769,
23858
+ "eval_runtime": 82.5981,
23859
+ "eval_samples_per_second": 60.534,
23860
+ "eval_steps_per_second": 15.134,
23861
+ "num_input_tokens_seen": 70243253472,
23862
+ "step": 134000
23863
  }
23864
  ],
23865
  "logging_steps": 50,
23866
  "max_steps": 140000,
23867
+ "num_input_tokens_seen": 70243253472,
23868
  "num_train_epochs": 2,
23869
  "save_steps": 1000,
23870
  "stateful_callbacks": {
 
23879
  "attributes": {}
23880
  }
23881
  },
23882
+ "total_flos": 1.243177462760067e+20,
23883
  "train_batch_size": 32,
23884
  "trial_name": null,
23885
  "trial_params": null