Azrail commited on
Commit
1063b27
·
verified ·
1 Parent(s): c8f570c

Training in progress, step 55000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa0a1572ea481edcf54695292d1afeb45339e9d2c4b988649cdfc68bb148006f
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5142916764b6385c48d096b2a7f336531a047dd5a1c0cd7b8aa09a2fdd35007
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88539436cc04b7ab674c4b703cc9d0b734fa709baeed11a8b4233a791dc8b00e
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c871f297ec758cbe8e1e4a52c756dfd036112baba8fbed3f20c9699d23ba9b0
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e516d1931a63763a7fdfb84f01f54aaada25beb218520b62969ba08ff897cee4
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a5eacfa99e53a8a1de73851121ef39f03223e9cc67398ac06a0e84e6dbf4ae3
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b89459823d581d70469027e8df5427d5b9a07aadbd42c55eac43368b994e74e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aaffe7b6e7bde964bb6e6784b39ca6209cca3589a90aff9795b02fa93025464e
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.2575813587416674,
6
  "eval_steps": 500,
7
- "global_step": 54000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9620,11 +9620,189 @@
9620
  "eval_steps_per_second": 23.204,
9621
  "num_input_tokens_seen": 14155771456,
9622
  "step": 54000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9623
  }
9624
  ],
9625
  "logging_steps": 50,
9626
  "max_steps": 70000,
9627
- "num_input_tokens_seen": 14155771456,
9628
  "num_train_epochs": 1,
9629
  "save_steps": 1000,
9630
  "stateful_callbacks": {
@@ -9639,7 +9817,7 @@
9639
  "attributes": {}
9640
  }
9641
  },
9642
- "total_flos": 3.7868070248094106e+18,
9643
  "train_batch_size": 64,
9644
  "trial_name": null,
9645
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.2623513839035501,
6
  "eval_steps": 500,
7
+ "global_step": 55000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9620
  "eval_steps_per_second": 23.204,
9621
  "num_input_tokens_seen": 14155771456,
9622
  "step": 54000
9623
+ },
9624
+ {
9625
+ "epoch": 0.2578198599997615,
9626
+ "grad_norm": 0.2028844654560089,
9627
+ "learning_rate": 0.001,
9628
+ "loss": 2.6039,
9629
+ "num_input_tokens_seen": 14168878656,
9630
+ "step": 54050
9631
+ },
9632
+ {
9633
+ "epoch": 0.25805836125785564,
9634
+ "grad_norm": 0.19936658442020416,
9635
+ "learning_rate": 0.001,
9636
+ "loss": 2.5985,
9637
+ "num_input_tokens_seen": 14181985856,
9638
+ "step": 54100
9639
+ },
9640
+ {
9641
+ "epoch": 0.2582968625159498,
9642
+ "grad_norm": 0.2087993025779724,
9643
+ "learning_rate": 0.001,
9644
+ "loss": 2.62,
9645
+ "num_input_tokens_seen": 14195093056,
9646
+ "step": 54150
9647
+ },
9648
+ {
9649
+ "epoch": 0.2585353637740439,
9650
+ "grad_norm": 0.18972960114479065,
9651
+ "learning_rate": 0.001,
9652
+ "loss": 2.5936,
9653
+ "num_input_tokens_seen": 14208200256,
9654
+ "step": 54200
9655
+ },
9656
+ {
9657
+ "epoch": 0.25877386503213806,
9658
+ "grad_norm": 0.2162945419549942,
9659
+ "learning_rate": 0.001,
9660
+ "loss": 2.6125,
9661
+ "num_input_tokens_seen": 14221307456,
9662
+ "step": 54250
9663
+ },
9664
+ {
9665
+ "epoch": 0.25901236629023217,
9666
+ "grad_norm": 0.2538411319255829,
9667
+ "learning_rate": 0.001,
9668
+ "loss": 2.6197,
9669
+ "num_input_tokens_seen": 14234414656,
9670
+ "step": 54300
9671
+ },
9672
+ {
9673
+ "epoch": 0.2592508675483263,
9674
+ "grad_norm": 0.28060850501060486,
9675
+ "learning_rate": 0.001,
9676
+ "loss": 2.6194,
9677
+ "num_input_tokens_seen": 14247521856,
9678
+ "step": 54350
9679
+ },
9680
+ {
9681
+ "epoch": 0.25948936880642043,
9682
+ "grad_norm": 0.21557608246803284,
9683
+ "learning_rate": 0.001,
9684
+ "loss": 2.623,
9685
+ "num_input_tokens_seen": 14260629056,
9686
+ "step": 54400
9687
+ },
9688
+ {
9689
+ "epoch": 0.2597278700645146,
9690
+ "grad_norm": 0.21628426015377045,
9691
+ "learning_rate": 0.001,
9692
+ "loss": 2.6077,
9693
+ "num_input_tokens_seen": 14273736256,
9694
+ "step": 54450
9695
+ },
9696
+ {
9697
+ "epoch": 0.25996637132260875,
9698
+ "grad_norm": 0.19123327732086182,
9699
+ "learning_rate": 0.001,
9700
+ "loss": 2.5991,
9701
+ "num_input_tokens_seen": 14286843456,
9702
+ "step": 54500
9703
+ },
9704
+ {
9705
+ "epoch": 0.25996637132260875,
9706
+ "eval_loss": 2.4861645698547363,
9707
+ "eval_runtime": 53.6448,
9708
+ "eval_samples_per_second": 93.206,
9709
+ "eval_steps_per_second": 23.301,
9710
+ "num_input_tokens_seen": 14286843456,
9711
+ "step": 54500
9712
+ },
9713
+ {
9714
+ "epoch": 0.26020487258070285,
9715
+ "grad_norm": 0.20462968945503235,
9716
+ "learning_rate": 0.001,
9717
+ "loss": 2.5887,
9718
+ "num_input_tokens_seen": 14299950656,
9719
+ "step": 54550
9720
+ },
9721
+ {
9722
+ "epoch": 0.260443373838797,
9723
+ "grad_norm": 0.20952938497066498,
9724
+ "learning_rate": 0.001,
9725
+ "loss": 2.608,
9726
+ "num_input_tokens_seen": 14313057856,
9727
+ "step": 54600
9728
+ },
9729
+ {
9730
+ "epoch": 0.2606818750968911,
9731
+ "grad_norm": 0.2095402032136917,
9732
+ "learning_rate": 0.001,
9733
+ "loss": 2.6079,
9734
+ "num_input_tokens_seen": 14326165056,
9735
+ "step": 54650
9736
+ },
9737
+ {
9738
+ "epoch": 0.2609203763549853,
9739
+ "grad_norm": 0.2343517541885376,
9740
+ "learning_rate": 0.001,
9741
+ "loss": 2.6124,
9742
+ "num_input_tokens_seen": 14339272256,
9743
+ "step": 54700
9744
+ },
9745
+ {
9746
+ "epoch": 0.26115887761307943,
9747
+ "grad_norm": 0.23840700089931488,
9748
+ "learning_rate": 0.001,
9749
+ "loss": 2.6015,
9750
+ "num_input_tokens_seen": 14352379456,
9751
+ "step": 54750
9752
+ },
9753
+ {
9754
+ "epoch": 0.26139737887117354,
9755
+ "grad_norm": 0.22024671733379364,
9756
+ "learning_rate": 0.001,
9757
+ "loss": 2.5812,
9758
+ "num_input_tokens_seen": 14365486656,
9759
+ "step": 54800
9760
+ },
9761
+ {
9762
+ "epoch": 0.2616358801292677,
9763
+ "grad_norm": 0.19884246587753296,
9764
+ "learning_rate": 0.001,
9765
+ "loss": 2.6118,
9766
+ "num_input_tokens_seen": 14378593856,
9767
+ "step": 54850
9768
+ },
9769
+ {
9770
+ "epoch": 0.2618743813873618,
9771
+ "grad_norm": 0.46560585498809814,
9772
+ "learning_rate": 0.001,
9773
+ "loss": 2.6024,
9774
+ "num_input_tokens_seen": 14391701056,
9775
+ "step": 54900
9776
+ },
9777
+ {
9778
+ "epoch": 0.26211288264545596,
9779
+ "grad_norm": 0.2956256568431854,
9780
+ "learning_rate": 0.001,
9781
+ "loss": 2.6073,
9782
+ "num_input_tokens_seen": 14404808256,
9783
+ "step": 54950
9784
+ },
9785
+ {
9786
+ "epoch": 0.2623513839035501,
9787
+ "grad_norm": 0.286327064037323,
9788
+ "learning_rate": 0.001,
9789
+ "loss": 2.5946,
9790
+ "num_input_tokens_seen": 14417915456,
9791
+ "step": 55000
9792
+ },
9793
+ {
9794
+ "epoch": 0.2623513839035501,
9795
+ "eval_loss": 2.4892399311065674,
9796
+ "eval_runtime": 53.3184,
9797
+ "eval_samples_per_second": 93.776,
9798
+ "eval_steps_per_second": 23.444,
9799
+ "num_input_tokens_seen": 14417915456,
9800
+ "step": 55000
9801
  }
9802
  ],
9803
  "logging_steps": 50,
9804
  "max_steps": 70000,
9805
+ "num_input_tokens_seen": 14417915456,
9806
  "num_train_epochs": 1,
9807
  "save_steps": 1000,
9808
  "stateful_callbacks": {
 
9817
  "attributes": {}
9818
  }
9819
  },
9820
+ "total_flos": 3.8569331033348506e+18,
9821
  "train_batch_size": 64,
9822
  "trial_name": null,
9823
  "trial_params": null