Azrail commited on
Commit
2455d42
·
verified ·
1 Parent(s): 51c02d4

Training in progress, step 55000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7005ee4ac699efbe46e787cdaab363f958cca84ce68e125ca53c53198e13eeac
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62e4ec5f596aeddac39f75a6501f66ecd7eb297d85fd39f281237c384adec887
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5151b63ca0c165877166c8eeb6faa3b784251ae57745f30c89f3dbaf08defd7
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6aa82a32a09e79af011cf35188194304359148308b76399c6d5815593f337709
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e516d1931a63763a7fdfb84f01f54aaada25beb218520b62969ba08ff897cee4
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a5eacfa99e53a8a1de73851121ef39f03223e9cc67398ac06a0e84e6dbf4ae3
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b89459823d581d70469027e8df5427d5b9a07aadbd42c55eac43368b994e74e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5df6e1f8ed049732a2e5d49c46b32207c644d0cb43e6b3e615ea32a67128cbab
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.36323093920422156,
6
  "eval_steps": 500,
7
- "global_step": 54000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9620,11 +9620,189 @@
9620
  "eval_steps_per_second": 23.494,
9621
  "num_input_tokens_seen": 14155776000,
9622
  "step": 54000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9623
  }
9624
  ],
9625
  "logging_steps": 50,
9626
  "max_steps": 60000,
9627
- "num_input_tokens_seen": 14155776000,
9628
  "num_train_epochs": 1,
9629
  "save_steps": 1000,
9630
  "stateful_callbacks": {
@@ -9639,7 +9817,7 @@
9639
  "attributes": {}
9640
  }
9641
  },
9642
- "total_flos": 3.78680824037376e+18,
9643
  "train_batch_size": 64,
9644
  "trial_name": null,
9645
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.3699574380783738,
6
  "eval_steps": 500,
7
+ "global_step": 55000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9620
  "eval_steps_per_second": 23.494,
9621
  "num_input_tokens_seen": 14155776000,
9622
  "step": 54000
9623
+ },
9624
+ {
9625
+ "epoch": 0.36356726414792917,
9626
+ "grad_norm": 0.2197147160768509,
9627
+ "learning_rate": 0.0009998286624877785,
9628
+ "loss": 3.0502,
9629
+ "num_input_tokens_seen": 14168883200,
9630
+ "step": 54050
9631
+ },
9632
+ {
9633
+ "epoch": 0.3639035890916368,
9634
+ "grad_norm": 0.22259306907653809,
9635
+ "learning_rate": 0.0009993147673772868,
9636
+ "loss": 3.0433,
9637
+ "num_input_tokens_seen": 14181990400,
9638
+ "step": 54100
9639
+ },
9640
+ {
9641
+ "epoch": 0.3642399140353444,
9642
+ "grad_norm": 0.19341766834259033,
9643
+ "learning_rate": 0.000998458666866564,
9644
+ "loss": 3.0486,
9645
+ "num_input_tokens_seen": 14195097600,
9646
+ "step": 54150
9647
+ },
9648
+ {
9649
+ "epoch": 0.364576238979052,
9650
+ "grad_norm": 0.2313617616891861,
9651
+ "learning_rate": 0.0009972609476841367,
9652
+ "loss": 3.0446,
9653
+ "num_input_tokens_seen": 14208204800,
9654
+ "step": 54200
9655
+ },
9656
+ {
9657
+ "epoch": 0.3649125639227596,
9658
+ "grad_norm": 0.1925128698348999,
9659
+ "learning_rate": 0.0009957224306869053,
9660
+ "loss": 3.0528,
9661
+ "num_input_tokens_seen": 14221312000,
9662
+ "step": 54250
9663
+ },
9664
+ {
9665
+ "epoch": 0.3652488888664672,
9666
+ "grad_norm": 0.2100643515586853,
9667
+ "learning_rate": 0.0009938441702975688,
9668
+ "loss": 3.0453,
9669
+ "num_input_tokens_seen": 14234419200,
9670
+ "step": 54300
9671
+ },
9672
+ {
9673
+ "epoch": 0.36558521381017484,
9674
+ "grad_norm": 0.46658360958099365,
9675
+ "learning_rate": 0.0009916274537819774,
9676
+ "loss": 3.0464,
9677
+ "num_input_tokens_seen": 14247526400,
9678
+ "step": 54350
9679
+ },
9680
+ {
9681
+ "epoch": 0.36592153875388245,
9682
+ "grad_norm": 0.19623732566833496,
9683
+ "learning_rate": 0.0009890738003669028,
9684
+ "loss": 3.0427,
9685
+ "num_input_tokens_seen": 14260633600,
9686
+ "step": 54400
9687
+ },
9688
+ {
9689
+ "epoch": 0.36625786369759006,
9690
+ "grad_norm": 0.24941138923168182,
9691
+ "learning_rate": 0.0009861849601988384,
9692
+ "loss": 3.0528,
9693
+ "num_input_tokens_seen": 14273740800,
9694
+ "step": 54450
9695
+ },
9696
+ {
9697
+ "epoch": 0.36659418864129767,
9698
+ "grad_norm": 0.22141198813915253,
9699
+ "learning_rate": 0.0009829629131445341,
9700
+ "loss": 3.0523,
9701
+ "num_input_tokens_seen": 14286848000,
9702
+ "step": 54500
9703
+ },
9704
+ {
9705
+ "epoch": 0.36659418864129767,
9706
+ "eval_loss": 2.9419288635253906,
9707
+ "eval_runtime": 53.6937,
9708
+ "eval_samples_per_second": 93.121,
9709
+ "eval_steps_per_second": 23.28,
9710
+ "num_input_tokens_seen": 14286848000,
9711
+ "step": 54500
9712
+ },
9713
+ {
9714
+ "epoch": 0.3669305135850053,
9715
+ "grad_norm": 0.2028401494026184,
9716
+ "learning_rate": 0.0009794098674340967,
9717
+ "loss": 3.0403,
9718
+ "num_input_tokens_seen": 14299955200,
9719
+ "step": 54550
9720
+ },
9721
+ {
9722
+ "epoch": 0.3672668385287129,
9723
+ "grad_norm": 0.20509253442287445,
9724
+ "learning_rate": 0.0009755282581475768,
9725
+ "loss": 3.0543,
9726
+ "num_input_tokens_seen": 14313062400,
9727
+ "step": 54600
9728
+ },
9729
+ {
9730
+ "epoch": 0.3676031634724205,
9731
+ "grad_norm": 1.2793521881103516,
9732
+ "learning_rate": 0.0009713207455460893,
9733
+ "loss": 3.0718,
9734
+ "num_input_tokens_seen": 14326169600,
9735
+ "step": 54650
9736
+ },
9737
+ {
9738
+ "epoch": 0.3679394884161281,
9739
+ "grad_norm": 1.1210218667984009,
9740
+ "learning_rate": 0.0009667902132486009,
9741
+ "loss": 3.0706,
9742
+ "num_input_tokens_seen": 14339276800,
9743
+ "step": 54700
9744
+ },
9745
+ {
9746
+ "epoch": 0.3682758133598357,
9747
+ "grad_norm": 0.5492864847183228,
9748
+ "learning_rate": 0.0009619397662556434,
9749
+ "loss": 3.0793,
9750
+ "num_input_tokens_seen": 14352384000,
9751
+ "step": 54750
9752
+ },
9753
+ {
9754
+ "epoch": 0.36861213830354334,
9755
+ "grad_norm": 0.34732338786125183,
9756
+ "learning_rate": 0.0009567727288213005,
9757
+ "loss": 3.0662,
9758
+ "num_input_tokens_seen": 14365491200,
9759
+ "step": 54800
9760
+ },
9761
+ {
9762
+ "epoch": 0.36894846324725095,
9763
+ "grad_norm": 0.2698073983192444,
9764
+ "learning_rate": 0.0009512926421749304,
9765
+ "loss": 3.0682,
9766
+ "num_input_tokens_seen": 14378598400,
9767
+ "step": 54850
9768
+ },
9769
+ {
9770
+ "epoch": 0.36928478819095856,
9771
+ "grad_norm": 0.593543529510498,
9772
+ "learning_rate": 0.0009455032620941839,
9773
+ "loss": 3.0507,
9774
+ "num_input_tokens_seen": 14391705600,
9775
+ "step": 54900
9776
+ },
9777
+ {
9778
+ "epoch": 0.36962111313466617,
9779
+ "grad_norm": 0.28389155864715576,
9780
+ "learning_rate": 0.0009394085563309827,
9781
+ "loss": 3.0593,
9782
+ "num_input_tokens_seen": 14404812800,
9783
+ "step": 54950
9784
+ },
9785
+ {
9786
+ "epoch": 0.3699574380783738,
9787
+ "grad_norm": 0.2569947838783264,
9788
+ "learning_rate": 0.0009330127018922195,
9789
+ "loss": 3.0524,
9790
+ "num_input_tokens_seen": 14417920000,
9791
+ "step": 55000
9792
+ },
9793
+ {
9794
+ "epoch": 0.3699574380783738,
9795
+ "eval_loss": 2.9468750953674316,
9796
+ "eval_runtime": 52.9661,
9797
+ "eval_samples_per_second": 94.4,
9798
+ "eval_steps_per_second": 23.6,
9799
+ "num_input_tokens_seen": 14417920000,
9800
+ "step": 55000
9801
  }
9802
  ],
9803
  "logging_steps": 50,
9804
  "max_steps": 60000,
9805
+ "num_input_tokens_seen": 14417920000,
9806
  "num_train_epochs": 1,
9807
  "save_steps": 1000,
9808
  "stateful_callbacks": {
 
9817
  "attributes": {}
9818
  }
9819
  },
9820
+ "total_flos": 3.8569343188992e+18,
9821
  "train_batch_size": 64,
9822
  "trial_name": null,
9823
  "trial_params": null