Azrail commited on
Commit
5769f26
·
verified ·
1 Parent(s): aa4f57e

Training in progress, step 27000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:28394340845b35c88e8a63417e18c503dadf4a251790835d2715e5a4962f656e
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5b523d0237ff4825791520de6c6899e7d737f3dbfe8441d833895f1e2466285
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4565341b2daf769a1d6b98280e7a99c73d3df5a11f570b225860490fa5b0252c
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:149a5f50fc47d3d0a29e92a6c18a1d78db3365d41cfd7f18ae74185f9b0fbc4e
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b4ee497eed0fe85641f8ca254d6d7e11e60873712ef2108d29f717bef60c5dd
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d1d738e0f013e71559a982b5bed46734a8c7b8ac496ca76379bed24380a52a1
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ced8856b9ff194699de7fca54070bd17a17efd31d5f5d4d7e4c8ff1ec712ca9
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5585e9833c9684d1dabff9cec651205ae9bf4f81ab2bb2b589702ce44919fbb3
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.5711171243154488,
6
  "eval_steps": 500,
7
- "global_step": 26000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4636,11 +4636,189 @@
4636
  "eval_steps_per_second": 18.896,
4637
  "num_input_tokens_seen": 27262972160,
4638
  "step": 26000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4639
  }
4640
  ],
4641
  "logging_steps": 50,
4642
  "max_steps": 200000,
4643
- "num_input_tokens_seen": 27262972160,
4644
  "num_train_epochs": 5,
4645
  "save_steps": 1000,
4646
  "stateful_callbacks": {
@@ -4655,7 +4833,7 @@
4655
  "attributes": {}
4656
  }
4657
  },
4658
- "total_flos": 1.5526462428179988e+19,
4659
  "train_batch_size": 64,
4660
  "trial_name": null,
4661
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.5930831675583508,
6
  "eval_steps": 500,
7
+ "global_step": 27000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4636
  "eval_steps_per_second": 18.896,
4637
  "num_input_tokens_seen": 27262972160,
4638
  "step": 26000
4639
+ },
4640
+ {
4641
+ "epoch": 0.5722154264775939,
4642
+ "grad_norm": 0.1538383513689041,
4643
+ "learning_rate": 0.001,
4644
+ "loss": 2.6783,
4645
+ "num_input_tokens_seen": 27315400960,
4646
+ "step": 26050
4647
+ },
4648
+ {
4649
+ "epoch": 0.5733137286397391,
4650
+ "grad_norm": 0.15545998513698578,
4651
+ "learning_rate": 0.001,
4652
+ "loss": 2.6798,
4653
+ "num_input_tokens_seen": 27367829760,
4654
+ "step": 26100
4655
+ },
4656
+ {
4657
+ "epoch": 0.5744120308018842,
4658
+ "grad_norm": 0.15456970036029816,
4659
+ "learning_rate": 0.001,
4660
+ "loss": 2.6836,
4661
+ "num_input_tokens_seen": 27420258560,
4662
+ "step": 26150
4663
+ },
4664
+ {
4665
+ "epoch": 0.5755103329640292,
4666
+ "grad_norm": 0.1353277862071991,
4667
+ "learning_rate": 0.001,
4668
+ "loss": 2.6777,
4669
+ "num_input_tokens_seen": 27472687360,
4670
+ "step": 26200
4671
+ },
4672
+ {
4673
+ "epoch": 0.5766086351261743,
4674
+ "grad_norm": 0.15124258399009705,
4675
+ "learning_rate": 0.001,
4676
+ "loss": 2.681,
4677
+ "num_input_tokens_seen": 27525116160,
4678
+ "step": 26250
4679
+ },
4680
+ {
4681
+ "epoch": 0.5777069372883195,
4682
+ "grad_norm": 0.14200901985168457,
4683
+ "learning_rate": 0.001,
4684
+ "loss": 2.6827,
4685
+ "num_input_tokens_seen": 27577544960,
4686
+ "step": 26300
4687
+ },
4688
+ {
4689
+ "epoch": 0.5788052394504645,
4690
+ "grad_norm": 0.15356388688087463,
4691
+ "learning_rate": 0.001,
4692
+ "loss": 2.6802,
4693
+ "num_input_tokens_seen": 27629973760,
4694
+ "step": 26350
4695
+ },
4696
+ {
4697
+ "epoch": 0.5799035416126096,
4698
+ "grad_norm": 0.17395390570163727,
4699
+ "learning_rate": 0.001,
4700
+ "loss": 2.6921,
4701
+ "num_input_tokens_seen": 27682402560,
4702
+ "step": 26400
4703
+ },
4704
+ {
4705
+ "epoch": 0.5810018437747547,
4706
+ "grad_norm": 0.1507692188024521,
4707
+ "learning_rate": 0.001,
4708
+ "loss": 2.6811,
4709
+ "num_input_tokens_seen": 27734831360,
4710
+ "step": 26450
4711
+ },
4712
+ {
4713
+ "epoch": 0.5821001459368998,
4714
+ "grad_norm": 0.14512786269187927,
4715
+ "learning_rate": 0.001,
4716
+ "loss": 2.6798,
4717
+ "num_input_tokens_seen": 27787260160,
4718
+ "step": 26500
4719
+ },
4720
+ {
4721
+ "epoch": 0.5821001459368998,
4722
+ "eval_loss": 2.5802626609802246,
4723
+ "eval_runtime": 67.1032,
4724
+ "eval_samples_per_second": 74.512,
4725
+ "eval_steps_per_second": 18.628,
4726
+ "num_input_tokens_seen": 27787260160,
4727
+ "step": 26500
4728
+ },
4729
+ {
4730
+ "epoch": 0.5831984480990449,
4731
+ "grad_norm": 0.15365912020206451,
4732
+ "learning_rate": 0.001,
4733
+ "loss": 2.6813,
4734
+ "num_input_tokens_seen": 27839688960,
4735
+ "step": 26550
4736
+ },
4737
+ {
4738
+ "epoch": 0.58429675026119,
4739
+ "grad_norm": 0.14015646278858185,
4740
+ "learning_rate": 0.001,
4741
+ "loss": 2.6774,
4742
+ "num_input_tokens_seen": 27892117760,
4743
+ "step": 26600
4744
+ },
4745
+ {
4746
+ "epoch": 0.5853950524233351,
4747
+ "grad_norm": 0.1529797911643982,
4748
+ "learning_rate": 0.001,
4749
+ "loss": 2.6751,
4750
+ "num_input_tokens_seen": 27944546560,
4751
+ "step": 26650
4752
+ },
4753
+ {
4754
+ "epoch": 0.5864933545854801,
4755
+ "grad_norm": 0.16909636557102203,
4756
+ "learning_rate": 0.001,
4757
+ "loss": 2.6795,
4758
+ "num_input_tokens_seen": 27996975360,
4759
+ "step": 26700
4760
+ },
4761
+ {
4762
+ "epoch": 0.5875916567476253,
4763
+ "grad_norm": 0.14130276441574097,
4764
+ "learning_rate": 0.001,
4765
+ "loss": 2.6809,
4766
+ "num_input_tokens_seen": 28049404160,
4767
+ "step": 26750
4768
+ },
4769
+ {
4770
+ "epoch": 0.5886899589097704,
4771
+ "grad_norm": 0.15182790160179138,
4772
+ "learning_rate": 0.001,
4773
+ "loss": 2.685,
4774
+ "num_input_tokens_seen": 28101832960,
4775
+ "step": 26800
4776
+ },
4777
+ {
4778
+ "epoch": 0.5897882610719154,
4779
+ "grad_norm": 0.12757331132888794,
4780
+ "learning_rate": 0.001,
4781
+ "loss": 2.6766,
4782
+ "num_input_tokens_seen": 28154261760,
4783
+ "step": 26850
4784
+ },
4785
+ {
4786
+ "epoch": 0.5908865632340605,
4787
+ "grad_norm": 0.1527504026889801,
4788
+ "learning_rate": 0.001,
4789
+ "loss": 2.6767,
4790
+ "num_input_tokens_seen": 28206690560,
4791
+ "step": 26900
4792
+ },
4793
+ {
4794
+ "epoch": 0.5919848653962057,
4795
+ "grad_norm": 0.18337304890155792,
4796
+ "learning_rate": 0.001,
4797
+ "loss": 2.6752,
4798
+ "num_input_tokens_seen": 28259119360,
4799
+ "step": 26950
4800
+ },
4801
+ {
4802
+ "epoch": 0.5930831675583508,
4803
+ "grad_norm": 0.1472473442554474,
4804
+ "learning_rate": 0.001,
4805
+ "loss": 2.6717,
4806
+ "num_input_tokens_seen": 28311548160,
4807
+ "step": 27000
4808
+ },
4809
+ {
4810
+ "epoch": 0.5930831675583508,
4811
+ "eval_loss": 2.5781941413879395,
4812
+ "eval_runtime": 66.2194,
4813
+ "eval_samples_per_second": 75.507,
4814
+ "eval_steps_per_second": 18.877,
4815
+ "num_input_tokens_seen": 28311548160,
4816
+ "step": 27000
4817
  }
4818
  ],
4819
  "logging_steps": 50,
4820
  "max_steps": 200000,
4821
+ "num_input_tokens_seen": 28311548160,
4822
  "num_train_epochs": 5,
4823
  "save_steps": 1000,
4824
  "stateful_callbacks": {
 
4833
  "attributes": {}
4834
  }
4835
  },
4836
+ "total_flos": 1.6123634144144916e+19,
4837
  "train_batch_size": 64,
4838
  "trial_name": null,
4839
  "trial_params": null