Azrail commited on
Commit
6dcd8b6
·
verified ·
1 Parent(s): b3f521b

Training in progress, step 39000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a94fdb7295accdf7201fc02029e5ae45ac44dcd9ef16798d70e0f488636e1f9c
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92d9de77e169ab6755eee5c1e2686926ed90a44667e2ca5eaaa214d0bfb470d0
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d665c90e6bb04abc526448689a0e7c0f687d8ae7453c9e6c300bb8f38a3b48a2
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32d052133c3436978400ad1196231762f190f602db77149c4cd7cd33bf55ce04
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:067fc834db8daa6bcb7d646c19fb2debac62a3ca3a0f0e8b29d38f87eb5e83ea
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25ff3bb999ac5f8f98fec3e5d0ee521c3ada6460eb2706bfa5386f4fa0d04e58
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7b9f11080dc8caed5e3c50cfbb46586ae36896ccfee6afab64bede4080bf44b1
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0453f791eed05815dae518781dc172eec5529318c2577a889d73a15d6a871e53
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.8347096432302714,
6
  "eval_steps": 500,
7
- "global_step": 38000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6772,11 +6772,189 @@
6772
  "eval_steps_per_second": 18.833,
6773
  "num_input_tokens_seen": 39845884160,
6774
  "step": 38000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6775
  }
6776
  ],
6777
  "logging_steps": 50,
6778
  "max_steps": 200000,
6779
- "num_input_tokens_seen": 39845884160,
6780
  "num_train_epochs": 5,
6781
  "save_steps": 1000,
6782
  "stateful_callbacks": {
@@ -6791,7 +6969,7 @@
6791
  "attributes": {}
6792
  }
6793
  },
6794
- "total_flos": 2.2692523019759124e+19,
6795
  "train_batch_size": 64,
6796
  "trial_name": null,
6797
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.8566756864731733,
6
  "eval_steps": 500,
7
+ "global_step": 39000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6772
  "eval_steps_per_second": 18.833,
6773
  "num_input_tokens_seen": 39845884160,
6774
  "step": 38000
6775
+ },
6776
+ {
6777
+ "epoch": 0.8358079453924165,
6778
+ "grad_norm": 0.15346269309520721,
6779
+ "learning_rate": 0.001,
6780
+ "loss": 2.645,
6781
+ "num_input_tokens_seen": 39898312960,
6782
+ "step": 38050
6783
+ },
6784
+ {
6785
+ "epoch": 0.8369062475545616,
6786
+ "grad_norm": 0.1504630148410797,
6787
+ "learning_rate": 0.001,
6788
+ "loss": 2.666,
6789
+ "num_input_tokens_seen": 39950741760,
6790
+ "step": 38100
6791
+ },
6792
+ {
6793
+ "epoch": 0.8380045497167067,
6794
+ "grad_norm": 0.19098903238773346,
6795
+ "learning_rate": 0.001,
6796
+ "loss": 2.6649,
6797
+ "num_input_tokens_seen": 40003170560,
6798
+ "step": 38150
6799
+ },
6800
+ {
6801
+ "epoch": 0.8391028518788518,
6802
+ "grad_norm": 0.15553973615169525,
6803
+ "learning_rate": 0.001,
6804
+ "loss": 2.6565,
6805
+ "num_input_tokens_seen": 40055599360,
6806
+ "step": 38200
6807
+ },
6808
+ {
6809
+ "epoch": 0.8402011540409968,
6810
+ "grad_norm": 0.15650159120559692,
6811
+ "learning_rate": 0.001,
6812
+ "loss": 2.6568,
6813
+ "num_input_tokens_seen": 40108028160,
6814
+ "step": 38250
6815
+ },
6816
+ {
6817
+ "epoch": 0.841299456203142,
6818
+ "grad_norm": 0.17787836492061615,
6819
+ "learning_rate": 0.001,
6820
+ "loss": 2.6497,
6821
+ "num_input_tokens_seen": 40160456960,
6822
+ "step": 38300
6823
+ },
6824
+ {
6825
+ "epoch": 0.8423977583652871,
6826
+ "grad_norm": 0.1535162478685379,
6827
+ "learning_rate": 0.001,
6828
+ "loss": 2.6492,
6829
+ "num_input_tokens_seen": 40212885760,
6830
+ "step": 38350
6831
+ },
6832
+ {
6833
+ "epoch": 0.8434960605274322,
6834
+ "grad_norm": 0.16713359951972961,
6835
+ "learning_rate": 0.001,
6836
+ "loss": 2.6534,
6837
+ "num_input_tokens_seen": 40265314560,
6838
+ "step": 38400
6839
+ },
6840
+ {
6841
+ "epoch": 0.8445943626895772,
6842
+ "grad_norm": 0.17087998986244202,
6843
+ "learning_rate": 0.001,
6844
+ "loss": 2.6602,
6845
+ "num_input_tokens_seen": 40317743360,
6846
+ "step": 38450
6847
+ },
6848
+ {
6849
+ "epoch": 0.8456926648517223,
6850
+ "grad_norm": 0.15651412308216095,
6851
+ "learning_rate": 0.001,
6852
+ "loss": 2.6547,
6853
+ "num_input_tokens_seen": 40370172160,
6854
+ "step": 38500
6855
+ },
6856
+ {
6857
+ "epoch": 0.8456926648517223,
6858
+ "eval_loss": 2.5524706840515137,
6859
+ "eval_runtime": 66.5023,
6860
+ "eval_samples_per_second": 75.185,
6861
+ "eval_steps_per_second": 18.796,
6862
+ "num_input_tokens_seen": 40370172160,
6863
+ "step": 38500
6864
+ },
6865
+ {
6866
+ "epoch": 0.8467909670138675,
6867
+ "grad_norm": 0.15205898880958557,
6868
+ "learning_rate": 0.001,
6869
+ "loss": 2.6541,
6870
+ "num_input_tokens_seen": 40422600960,
6871
+ "step": 38550
6872
+ },
6873
+ {
6874
+ "epoch": 0.8478892691760125,
6875
+ "grad_norm": 0.15865832567214966,
6876
+ "learning_rate": 0.001,
6877
+ "loss": 2.6536,
6878
+ "num_input_tokens_seen": 40475029760,
6879
+ "step": 38600
6880
+ },
6881
+ {
6882
+ "epoch": 0.8489875713381576,
6883
+ "grad_norm": 0.133284330368042,
6884
+ "learning_rate": 0.001,
6885
+ "loss": 2.6531,
6886
+ "num_input_tokens_seen": 40527458560,
6887
+ "step": 38650
6888
+ },
6889
+ {
6890
+ "epoch": 0.8500858735003027,
6891
+ "grad_norm": 0.1421806663274765,
6892
+ "learning_rate": 0.001,
6893
+ "loss": 2.6558,
6894
+ "num_input_tokens_seen": 40579887360,
6895
+ "step": 38700
6896
+ },
6897
+ {
6898
+ "epoch": 0.8511841756624479,
6899
+ "grad_norm": 0.19429996609687805,
6900
+ "learning_rate": 0.001,
6901
+ "loss": 2.6628,
6902
+ "num_input_tokens_seen": 40632316160,
6903
+ "step": 38750
6904
+ },
6905
+ {
6906
+ "epoch": 0.8522824778245929,
6907
+ "grad_norm": 0.14661937952041626,
6908
+ "learning_rate": 0.001,
6909
+ "loss": 2.6594,
6910
+ "num_input_tokens_seen": 40684744960,
6911
+ "step": 38800
6912
+ },
6913
+ {
6914
+ "epoch": 0.853380779986738,
6915
+ "grad_norm": 0.1694687008857727,
6916
+ "learning_rate": 0.001,
6917
+ "loss": 2.6571,
6918
+ "num_input_tokens_seen": 40737173760,
6919
+ "step": 38850
6920
+ },
6921
+ {
6922
+ "epoch": 0.8544790821488831,
6923
+ "grad_norm": 0.152188241481781,
6924
+ "learning_rate": 0.001,
6925
+ "loss": 2.6534,
6926
+ "num_input_tokens_seen": 40789602560,
6927
+ "step": 38900
6928
+ },
6929
+ {
6930
+ "epoch": 0.8555773843110281,
6931
+ "grad_norm": 0.1554640680551529,
6932
+ "learning_rate": 0.001,
6933
+ "loss": 2.649,
6934
+ "num_input_tokens_seen": 40842031360,
6935
+ "step": 38950
6936
+ },
6937
+ {
6938
+ "epoch": 0.8566756864731733,
6939
+ "grad_norm": 0.1481955647468567,
6940
+ "learning_rate": 0.001,
6941
+ "loss": 2.6527,
6942
+ "num_input_tokens_seen": 40894460160,
6943
+ "step": 39000
6944
+ },
6945
+ {
6946
+ "epoch": 0.8566756864731733,
6947
+ "eval_loss": 2.547664165496826,
6948
+ "eval_runtime": 66.2874,
6949
+ "eval_samples_per_second": 75.429,
6950
+ "eval_steps_per_second": 18.857,
6951
+ "num_input_tokens_seen": 40894460160,
6952
+ "step": 39000
6953
  }
6954
  ],
6955
  "logging_steps": 50,
6956
  "max_steps": 200000,
6957
+ "num_input_tokens_seen": 40894460160,
6958
  "num_train_epochs": 5,
6959
  "save_steps": 1000,
6960
  "stateful_callbacks": {
 
6969
  "attributes": {}
6970
  }
6971
  },
6972
+ "total_flos": 2.3289694735724052e+19,
6973
  "train_batch_size": 64,
6974
  "trial_name": null,
6975
  "trial_params": null