minpeter commited on
Commit
af26083
·
verified ·
1 Parent(s): 33c00d1

Training in progress, step 21000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df03cae2dd432c211456aab943782bf83ba84e08565c4c981659cb89c83a578e
3
  size 373077376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba9d75a78fad20f4b1e389f6c85dda0f453be86d800ed2eba32953160cc02033
3
  size 373077376
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4292287a7fa690fe53e7b389faee8373877f88d995cc45d3321aeb77bf8c4af6
3
  size 209816139
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df2f641838670afd6d1bb0181e8efde74cebba7ddaeaad933397844d1eb9afb6
3
  size 209816139
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:22e8bb13b8b5cd110e015717953ca96d5c03c35ddfe30ca45c1fab9651d07421
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eda74d083cd5d9b07d403914b5a235c44dd87bc93a29636e940f36b95f8743f9
3
  size 14917
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:76ace0471241ab08ffd32878e593821b741d6b0b68bcb601ea44671e5ef83eef
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91f1feed6ec98326449107f6ac06aad035f8176b90aa697c6edf6a509039a50c
3
  size 14917
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba4436ed0869bacf238e760f8e2f2044a22ff86693a77a3015046ef89f00fc7e
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6645e7dc37725bbae83eaf70fb81001a75be54d9a6554f43743dfb20cfc0984
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 2000,
3
  "best_metric": 9.218317031860352,
4
  "best_model_checkpoint": "./artifacts/models/base-250725-test/checkpoint-2000",
5
- "epoch": 0.06246213233227356,
6
  "eval_steps": 1000,
7
- "global_step": 20000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5768,6 +5768,294 @@
5768
  "eval_samples_per_second": 50.82,
5769
  "eval_steps_per_second": 3.184,
5770
  "step": 20000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5771
  }
5772
  ],
5773
  "logging_steps": 25,
@@ -5787,7 +6075,7 @@
5787
  "attributes": {}
5788
  }
5789
  },
5790
- "total_flos": 2.53630733446493e+18,
5791
  "train_batch_size": 8,
5792
  "trial_name": null,
5793
  "trial_params": null
 
2
  "best_global_step": 2000,
3
  "best_metric": 9.218317031860352,
4
  "best_model_checkpoint": "./artifacts/models/base-250725-test/checkpoint-2000",
5
+ "epoch": 0.06558523894888724,
6
  "eval_steps": 1000,
7
+ "global_step": 21000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5768
  "eval_samples_per_second": 50.82,
5769
  "eval_steps_per_second": 3.184,
5770
  "step": 20000
5771
+ },
5772
+ {
5773
+ "epoch": 0.0625402099976889,
5774
+ "grad_norm": 50.0,
5775
+ "learning_rate": 0.000999570404664504,
5776
+ "loss": 34.3706,
5777
+ "step": 20025
5778
+ },
5779
+ {
5780
+ "epoch": 0.06261828766310425,
5781
+ "grad_norm": 45.75,
5782
+ "learning_rate": 0.0009995650375662492,
5783
+ "loss": 34.1775,
5784
+ "step": 20050
5785
+ },
5786
+ {
5787
+ "epoch": 0.06269636532851959,
5788
+ "grad_norm": 43.5,
5789
+ "learning_rate": 0.0009995596371637897,
5790
+ "loss": 34.3327,
5791
+ "step": 20075
5792
+ },
5793
+ {
5794
+ "epoch": 0.06277444299393492,
5795
+ "grad_norm": 43.25,
5796
+ "learning_rate": 0.0009995542034574863,
5797
+ "loss": 34.3871,
5798
+ "step": 20100
5799
+ },
5800
+ {
5801
+ "epoch": 0.06285252065935026,
5802
+ "grad_norm": 42.75,
5803
+ "learning_rate": 0.0009995487364477004,
5804
+ "loss": 33.8116,
5805
+ "step": 20125
5806
+ },
5807
+ {
5808
+ "epoch": 0.06293059832476561,
5809
+ "grad_norm": 37.5,
5810
+ "learning_rate": 0.0009995432361347971,
5811
+ "loss": 33.9015,
5812
+ "step": 20150
5813
+ },
5814
+ {
5815
+ "epoch": 0.06300867599018095,
5816
+ "grad_norm": 38.5,
5817
+ "learning_rate": 0.0009995377025191427,
5818
+ "loss": 33.8639,
5819
+ "step": 20175
5820
+ },
5821
+ {
5822
+ "epoch": 0.0630867536555963,
5823
+ "grad_norm": 37.25,
5824
+ "learning_rate": 0.0009995321356011063,
5825
+ "loss": 33.6663,
5826
+ "step": 20200
5827
+ },
5828
+ {
5829
+ "epoch": 0.06316483132101164,
5830
+ "grad_norm": 40.5,
5831
+ "learning_rate": 0.0009995265353810589,
5832
+ "loss": 33.8264,
5833
+ "step": 20225
5834
+ },
5835
+ {
5836
+ "epoch": 0.06324290898642698,
5837
+ "grad_norm": 45.25,
5838
+ "learning_rate": 0.0009995209018593737,
5839
+ "loss": 33.6851,
5840
+ "step": 20250
5841
+ },
5842
+ {
5843
+ "epoch": 0.06332098665184233,
5844
+ "grad_norm": 42.0,
5845
+ "learning_rate": 0.0009995152350364266,
5846
+ "loss": 33.5799,
5847
+ "step": 20275
5848
+ },
5849
+ {
5850
+ "epoch": 0.06339906431725766,
5851
+ "grad_norm": 43.25,
5852
+ "learning_rate": 0.000999509534912595,
5853
+ "loss": 33.6905,
5854
+ "step": 20300
5855
+ },
5856
+ {
5857
+ "epoch": 0.063477141982673,
5858
+ "grad_norm": 37.25,
5859
+ "learning_rate": 0.0009995038014882593,
5860
+ "loss": 33.4839,
5861
+ "step": 20325
5862
+ },
5863
+ {
5864
+ "epoch": 0.06355521964808834,
5865
+ "grad_norm": 35.75,
5866
+ "learning_rate": 0.0009994980347638016,
5867
+ "loss": 33.6105,
5868
+ "step": 20350
5869
+ },
5870
+ {
5871
+ "epoch": 0.06363329731350369,
5872
+ "grad_norm": 38.0,
5873
+ "learning_rate": 0.0009994922347396063,
5874
+ "loss": 33.9047,
5875
+ "step": 20375
5876
+ },
5877
+ {
5878
+ "epoch": 0.06371137497891903,
5879
+ "grad_norm": 40.25,
5880
+ "learning_rate": 0.00099948640141606,
5881
+ "loss": 34.1876,
5882
+ "step": 20400
5883
+ },
5884
+ {
5885
+ "epoch": 0.06378945264433437,
5886
+ "grad_norm": 45.75,
5887
+ "learning_rate": 0.0009994805347935517,
5888
+ "loss": 33.9303,
5889
+ "step": 20425
5890
+ },
5891
+ {
5892
+ "epoch": 0.06386753030974972,
5893
+ "grad_norm": 42.75,
5894
+ "learning_rate": 0.0009994746348724727,
5895
+ "loss": 33.951,
5896
+ "step": 20450
5897
+ },
5898
+ {
5899
+ "epoch": 0.06394560797516506,
5900
+ "grad_norm": 50.0,
5901
+ "learning_rate": 0.000999468701653216,
5902
+ "loss": 34.056,
5903
+ "step": 20475
5904
+ },
5905
+ {
5906
+ "epoch": 0.0640236856405804,
5907
+ "grad_norm": 50.5,
5908
+ "learning_rate": 0.0009994627351361772,
5909
+ "loss": 33.9114,
5910
+ "step": 20500
5911
+ },
5912
+ {
5913
+ "epoch": 0.06410176330599573,
5914
+ "grad_norm": 42.25,
5915
+ "learning_rate": 0.0009994567353217541,
5916
+ "loss": 34.2422,
5917
+ "step": 20525
5918
+ },
5919
+ {
5920
+ "epoch": 0.06417984097141108,
5921
+ "grad_norm": 44.25,
5922
+ "learning_rate": 0.0009994507022103465,
5923
+ "loss": 34.0631,
5924
+ "step": 20550
5925
+ },
5926
+ {
5927
+ "epoch": 0.06425791863682642,
5928
+ "grad_norm": 39.75,
5929
+ "learning_rate": 0.000999444635802357,
5930
+ "loss": 33.8447,
5931
+ "step": 20575
5932
+ },
5933
+ {
5934
+ "epoch": 0.06433599630224177,
5935
+ "grad_norm": 44.75,
5936
+ "learning_rate": 0.00099943853609819,
5937
+ "loss": 33.8587,
5938
+ "step": 20600
5939
+ },
5940
+ {
5941
+ "epoch": 0.06441407396765711,
5942
+ "grad_norm": 39.25,
5943
+ "learning_rate": 0.0009994324030982518,
5944
+ "loss": 33.943,
5945
+ "step": 20625
5946
+ },
5947
+ {
5948
+ "epoch": 0.06449215163307245,
5949
+ "grad_norm": 41.75,
5950
+ "learning_rate": 0.0009994262368029515,
5951
+ "loss": 33.9425,
5952
+ "step": 20650
5953
+ },
5954
+ {
5955
+ "epoch": 0.0645702292984878,
5956
+ "grad_norm": 44.5,
5957
+ "learning_rate": 0.0009994200372127,
5958
+ "loss": 34.0832,
5959
+ "step": 20675
5960
+ },
5961
+ {
5962
+ "epoch": 0.06464830696390314,
5963
+ "grad_norm": 39.25,
5964
+ "learning_rate": 0.000999413804327911,
5965
+ "loss": 33.9888,
5966
+ "step": 20700
5967
+ },
5968
+ {
5969
+ "epoch": 0.06472638462931847,
5970
+ "grad_norm": 43.75,
5971
+ "learning_rate": 0.0009994075381489994,
5972
+ "loss": 34.1022,
5973
+ "step": 20725
5974
+ },
5975
+ {
5976
+ "epoch": 0.06480446229473381,
5977
+ "grad_norm": 44.25,
5978
+ "learning_rate": 0.0009994012386763836,
5979
+ "loss": 33.9719,
5980
+ "step": 20750
5981
+ },
5982
+ {
5983
+ "epoch": 0.06488253996014916,
5984
+ "grad_norm": 42.0,
5985
+ "learning_rate": 0.000999394905910483,
5986
+ "loss": 33.7568,
5987
+ "step": 20775
5988
+ },
5989
+ {
5990
+ "epoch": 0.0649606176255645,
5991
+ "grad_norm": 43.75,
5992
+ "learning_rate": 0.0009993885398517201,
5993
+ "loss": 33.7079,
5994
+ "step": 20800
5995
+ },
5996
+ {
5997
+ "epoch": 0.06503869529097984,
5998
+ "grad_norm": 40.0,
5999
+ "learning_rate": 0.0009993821405005195,
6000
+ "loss": 33.8396,
6001
+ "step": 20825
6002
+ },
6003
+ {
6004
+ "epoch": 0.06511677295639519,
6005
+ "grad_norm": 42.5,
6006
+ "learning_rate": 0.0009993757078573073,
6007
+ "loss": 33.6027,
6008
+ "step": 20850
6009
+ },
6010
+ {
6011
+ "epoch": 0.06519485062181053,
6012
+ "grad_norm": 42.5,
6013
+ "learning_rate": 0.0009993692419225126,
6014
+ "loss": 33.5388,
6015
+ "step": 20875
6016
+ },
6017
+ {
6018
+ "epoch": 0.06527292828722588,
6019
+ "grad_norm": 55.0,
6020
+ "learning_rate": 0.0009993627426965667,
6021
+ "loss": 33.775,
6022
+ "step": 20900
6023
+ },
6024
+ {
6025
+ "epoch": 0.0653510059526412,
6026
+ "grad_norm": 39.0,
6027
+ "learning_rate": 0.0009993562101799024,
6028
+ "loss": 33.8984,
6029
+ "step": 20925
6030
+ },
6031
+ {
6032
+ "epoch": 0.06542908361805655,
6033
+ "grad_norm": 41.5,
6034
+ "learning_rate": 0.0009993496443729557,
6035
+ "loss": 33.8582,
6036
+ "step": 20950
6037
+ },
6038
+ {
6039
+ "epoch": 0.06550716128347189,
6040
+ "grad_norm": 37.25,
6041
+ "learning_rate": 0.0009993430452761639,
6042
+ "loss": 33.8915,
6043
+ "step": 20975
6044
+ },
6045
+ {
6046
+ "epoch": 0.06558523894888724,
6047
+ "grad_norm": 35.0,
6048
+ "learning_rate": 0.0009993364128899672,
6049
+ "loss": 33.5705,
6050
+ "step": 21000
6051
+ },
6052
+ {
6053
+ "epoch": 0.06558523894888724,
6054
+ "eval_loss": 33.73247146606445,
6055
+ "eval_runtime": 102.3252,
6056
+ "eval_samples_per_second": 50.848,
6057
+ "eval_steps_per_second": 3.186,
6058
+ "step": 21000
6059
  }
6060
  ],
6061
  "logging_steps": 25,
 
6075
  "attributes": {}
6076
  }
6077
  },
6078
+ "total_flos": 2.663111367480836e+18,
6079
  "train_batch_size": 8,
6080
  "trial_name": null,
6081
  "trial_params": null