Azrail commited on
Commit
6d57010
·
verified ·
1 Parent(s): 55012fe

Training in progress, step 33000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f3efdd22645edf9b27968c44325394fb2b759ab91da7c8ce83b3d5624316247
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4d3f9e40108aa240d3ccb2dec6c98e3c8dee794d5b181e301f16cb825f4f24c
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cdf80f22e7a0541c733f265b4f922632311af4702895a6489cc3c6583b1b00ec
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34f35e26a2da9f3a49992a7401bf48035da49e1863e9c12106da901a102fce6c
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:26f742dd126d572747f29fd7ba88348146ec68ecb2ae0d2effd91de53bff9d0d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f916fc54175e9c81473454541a77405165ddc25577e0b82acf56f2d60728d556
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b45d7a8e84e284c770af40e442ab0efb2fec2b035c2481cdfd246cdf35d0dd1
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:281c918d3dcf25df4f5a9bbf64a4fd88f0fa5c69087d3374f9f2ce6266f988a9
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.7029133837728602,
6
  "eval_steps": 500,
7
- "global_step": 32000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5704,11 +5704,189 @@
5704
  "eval_steps_per_second": 18.716,
5705
  "num_input_tokens_seen": 33554428160,
5706
  "step": 32000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5707
  }
5708
  ],
5709
  "logging_steps": 50,
5710
  "max_steps": 200000,
5711
- "num_input_tokens_seen": 33554428160,
5712
  "num_train_epochs": 5,
5713
  "save_steps": 1000,
5714
  "stateful_callbacks": {
@@ -5723,7 +5901,7 @@
5723
  "attributes": {}
5724
  }
5725
  },
5726
- "total_flos": 1.9109492723969556e+19,
5727
  "train_batch_size": 64,
5728
  "trial_name": null,
5729
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.724879427015762,
6
  "eval_steps": 500,
7
+ "global_step": 33000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5704
  "eval_steps_per_second": 18.716,
5705
  "num_input_tokens_seen": 33554428160,
5706
  "step": 32000
5707
+ },
5708
+ {
5709
+ "epoch": 0.7040116859350052,
5710
+ "grad_norm": 0.15333816409111023,
5711
+ "learning_rate": 0.001,
5712
+ "loss": 2.6605,
5713
+ "num_input_tokens_seen": 33606856960,
5714
+ "step": 32050
5715
+ },
5716
+ {
5717
+ "epoch": 0.7051099880971503,
5718
+ "grad_norm": 0.14965052902698517,
5719
+ "learning_rate": 0.001,
5720
+ "loss": 2.6551,
5721
+ "num_input_tokens_seen": 33659285760,
5722
+ "step": 32100
5723
+ },
5724
+ {
5725
+ "epoch": 0.7062082902592954,
5726
+ "grad_norm": 0.1994074285030365,
5727
+ "learning_rate": 0.001,
5728
+ "loss": 2.6652,
5729
+ "num_input_tokens_seen": 33711714560,
5730
+ "step": 32150
5731
+ },
5732
+ {
5733
+ "epoch": 0.7073065924214406,
5734
+ "grad_norm": 0.3089894652366638,
5735
+ "learning_rate": 0.001,
5736
+ "loss": 2.6814,
5737
+ "num_input_tokens_seen": 33764143360,
5738
+ "step": 32200
5739
+ },
5740
+ {
5741
+ "epoch": 0.7084048945835856,
5742
+ "grad_norm": 0.14903652667999268,
5743
+ "learning_rate": 0.001,
5744
+ "loss": 2.6834,
5745
+ "num_input_tokens_seen": 33816572160,
5746
+ "step": 32250
5747
+ },
5748
+ {
5749
+ "epoch": 0.7095031967457307,
5750
+ "grad_norm": 0.17594854533672333,
5751
+ "learning_rate": 0.001,
5752
+ "loss": 2.6618,
5753
+ "num_input_tokens_seen": 33869000960,
5754
+ "step": 32300
5755
+ },
5756
+ {
5757
+ "epoch": 0.7106014989078758,
5758
+ "grad_norm": 0.15634667873382568,
5759
+ "learning_rate": 0.001,
5760
+ "loss": 2.6663,
5761
+ "num_input_tokens_seen": 33921429760,
5762
+ "step": 32350
5763
+ },
5764
+ {
5765
+ "epoch": 0.7116998010700208,
5766
+ "grad_norm": 0.13893702626228333,
5767
+ "learning_rate": 0.001,
5768
+ "loss": 2.67,
5769
+ "num_input_tokens_seen": 33973858560,
5770
+ "step": 32400
5771
+ },
5772
+ {
5773
+ "epoch": 0.712798103232166,
5774
+ "grad_norm": 0.16974663734436035,
5775
+ "learning_rate": 0.001,
5776
+ "loss": 2.6686,
5777
+ "num_input_tokens_seen": 34026287360,
5778
+ "step": 32450
5779
+ },
5780
+ {
5781
+ "epoch": 0.7138964053943111,
5782
+ "grad_norm": 0.15336968004703522,
5783
+ "learning_rate": 0.001,
5784
+ "loss": 2.6703,
5785
+ "num_input_tokens_seen": 34078716160,
5786
+ "step": 32500
5787
+ },
5788
+ {
5789
+ "epoch": 0.7138964053943111,
5790
+ "eval_loss": 2.5648574829101562,
5791
+ "eval_runtime": 66.0796,
5792
+ "eval_samples_per_second": 75.666,
5793
+ "eval_steps_per_second": 18.917,
5794
+ "num_input_tokens_seen": 34078716160,
5795
+ "step": 32500
5796
+ },
5797
+ {
5798
+ "epoch": 0.7149947075564561,
5799
+ "grad_norm": 1.428727626800537,
5800
+ "learning_rate": 0.001,
5801
+ "loss": 2.8433,
5802
+ "num_input_tokens_seen": 34131144960,
5803
+ "step": 32550
5804
+ },
5805
+ {
5806
+ "epoch": 0.7160930097186012,
5807
+ "grad_norm": 0.1666879504919052,
5808
+ "learning_rate": 0.001,
5809
+ "loss": 2.7236,
5810
+ "num_input_tokens_seen": 34183573760,
5811
+ "step": 32600
5812
+ },
5813
+ {
5814
+ "epoch": 0.7171913118807464,
5815
+ "grad_norm": 0.16038021445274353,
5816
+ "learning_rate": 0.001,
5817
+ "loss": 2.6876,
5818
+ "num_input_tokens_seen": 34236002560,
5819
+ "step": 32650
5820
+ },
5821
+ {
5822
+ "epoch": 0.7182896140428915,
5823
+ "grad_norm": 0.1514110267162323,
5824
+ "learning_rate": 0.001,
5825
+ "loss": 2.6717,
5826
+ "num_input_tokens_seen": 34288431360,
5827
+ "step": 32700
5828
+ },
5829
+ {
5830
+ "epoch": 0.7193879162050365,
5831
+ "grad_norm": 0.13304661214351654,
5832
+ "learning_rate": 0.001,
5833
+ "loss": 2.6664,
5834
+ "num_input_tokens_seen": 34340860160,
5835
+ "step": 32750
5836
+ },
5837
+ {
5838
+ "epoch": 0.7204862183671816,
5839
+ "grad_norm": 0.15957415103912354,
5840
+ "learning_rate": 0.001,
5841
+ "loss": 2.6683,
5842
+ "num_input_tokens_seen": 34393288960,
5843
+ "step": 32800
5844
+ },
5845
+ {
5846
+ "epoch": 0.7215845205293268,
5847
+ "grad_norm": 0.14532499015331268,
5848
+ "learning_rate": 0.001,
5849
+ "loss": 2.6632,
5850
+ "num_input_tokens_seen": 34445717760,
5851
+ "step": 32850
5852
+ },
5853
+ {
5854
+ "epoch": 0.7226828226914718,
5855
+ "grad_norm": 0.1402454972267151,
5856
+ "learning_rate": 0.001,
5857
+ "loss": 2.6631,
5858
+ "num_input_tokens_seen": 34498146560,
5859
+ "step": 32900
5860
+ },
5861
+ {
5862
+ "epoch": 0.7237811248536169,
5863
+ "grad_norm": 0.17248420417308807,
5864
+ "learning_rate": 0.001,
5865
+ "loss": 2.6743,
5866
+ "num_input_tokens_seen": 34550575360,
5867
+ "step": 32950
5868
+ },
5869
+ {
5870
+ "epoch": 0.724879427015762,
5871
+ "grad_norm": 0.1455400288105011,
5872
+ "learning_rate": 0.001,
5873
+ "loss": 2.6598,
5874
+ "num_input_tokens_seen": 34603004160,
5875
+ "step": 33000
5876
+ },
5877
+ {
5878
+ "epoch": 0.724879427015762,
5879
+ "eval_loss": 2.5639312267303467,
5880
+ "eval_runtime": 66.9575,
5881
+ "eval_samples_per_second": 74.674,
5882
+ "eval_steps_per_second": 18.669,
5883
+ "num_input_tokens_seen": 34603004160,
5884
+ "step": 33000
5885
  }
5886
  ],
5887
  "logging_steps": 50,
5888
  "max_steps": 200000,
5889
+ "num_input_tokens_seen": 34603004160,
5890
  "num_train_epochs": 5,
5891
  "save_steps": 1000,
5892
  "stateful_callbacks": {
 
5901
  "attributes": {}
5902
  }
5903
  },
5904
+ "total_flos": 1.9706664439934484e+19,
5905
  "train_batch_size": 64,
5906
  "trial_name": null,
5907
  "trial_params": null