Wilsonwin commited on
Commit
339d016
·
verified ·
1 Parent(s): f59c9e1

Training in progress, step 10000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:feeb8af86d4228c031ab0303150253b8e59c08c82f4f8aa78a75fae604e120a1
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:318c2656039c95a58242e4619aba90de89d286abfdd50c932ac46a5bbc6d6b36
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d5fa1cbde1c469de32a370ba5361ae4e7744a119f98350fc2511f131db06a4e
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fdbed07e432554d329c7e8d5c0f65220a1bfeee29ae26fa92a6aa0d5901ae56
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ccd074c7b8f0b016dc440e87123ddc293303707dc1fa944c0ab62d0b20aa48bd
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5948a5161f7923aa0acf66b01adf35dc2196a8acf5bd2c21227561e5bff45666
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4784f3b1ac308d4093c525f58ebfb1ed5c4e7ca17828bd58e2e6a8e2baed20b5
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53471871a37f3cc35b4a656a6f0cfda18046c304a91d9bf8b29b14eea2ccc156
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.6050008447372868,
6
  "eval_steps": 500,
7
- "global_step": 9500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6817,6 +6817,364 @@
6817
  "eval_samples_per_second": 279.306,
6818
  "eval_steps_per_second": 5.865,
6819
  "step": 9500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6820
  }
6821
  ],
6822
  "logging_steps": 10,
@@ -6836,7 +7194,7 @@
6836
  "attributes": {}
6837
  }
6838
  },
6839
- "total_flos": 3.177318894608056e+17,
6840
  "train_batch_size": 48,
6841
  "trial_name": null,
6842
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.6894745734076704,
6
  "eval_steps": 500,
7
+ "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6817
  "eval_samples_per_second": 279.306,
6818
  "eval_steps_per_second": 5.865,
6819
  "step": 9500
6820
+ },
6821
+ {
6822
+ "epoch": 1.6066903193106943,
6823
+ "grad_norm": 0.4607177972793579,
6824
+ "learning_rate": 3.960727047894527e-05,
6825
+ "loss": 4.358008575439453,
6826
+ "step": 9510
6827
+ },
6828
+ {
6829
+ "epoch": 1.608379793884102,
6830
+ "grad_norm": 0.49898746609687805,
6831
+ "learning_rate": 3.928353538569023e-05,
6832
+ "loss": 4.323298645019531,
6833
+ "step": 9520
6834
+ },
6835
+ {
6836
+ "epoch": 1.6100692684575098,
6837
+ "grad_norm": 0.4633605182170868,
6838
+ "learning_rate": 3.8960929302853074e-05,
6839
+ "loss": 4.317881393432617,
6840
+ "step": 9530
6841
+ },
6842
+ {
6843
+ "epoch": 1.6117587430309173,
6844
+ "grad_norm": 0.461166650056839,
6845
+ "learning_rate": 3.863945552014892e-05,
6846
+ "loss": 4.31908073425293,
6847
+ "step": 9540
6848
+ },
6849
+ {
6850
+ "epoch": 1.6134482176043252,
6851
+ "grad_norm": 0.46390029788017273,
6852
+ "learning_rate": 3.831911731574648e-05,
6853
+ "loss": 4.363689804077149,
6854
+ "step": 9550
6855
+ },
6856
+ {
6857
+ "epoch": 1.6151376921777327,
6858
+ "grad_norm": 0.47450077533721924,
6859
+ "learning_rate": 3.799991795623471e-05,
6860
+ "loss": 4.329352569580078,
6861
+ "step": 9560
6862
+ },
6863
+ {
6864
+ "epoch": 1.6168271667511402,
6865
+ "grad_norm": 0.4686853587627411,
6866
+ "learning_rate": 3.7681860696589216e-05,
6867
+ "loss": 4.3315582275390625,
6868
+ "step": 9570
6869
+ },
6870
+ {
6871
+ "epoch": 1.6185166413245482,
6872
+ "grad_norm": 0.4681236445903778,
6873
+ "learning_rate": 3.7364948780139344e-05,
6874
+ "loss": 4.294339752197265,
6875
+ "step": 9580
6876
+ },
6877
+ {
6878
+ "epoch": 1.6202061158979557,
6879
+ "grad_norm": 0.47375062108039856,
6880
+ "learning_rate": 3.70491854385351e-05,
6881
+ "loss": 4.285346984863281,
6882
+ "step": 9590
6883
+ },
6884
+ {
6885
+ "epoch": 1.6218955904713634,
6886
+ "grad_norm": 0.4612501859664917,
6887
+ "learning_rate": 3.673457389171401e-05,
6888
+ "loss": 4.301979446411133,
6889
+ "step": 9600
6890
+ },
6891
+ {
6892
+ "epoch": 1.6235850650447712,
6893
+ "grad_norm": 0.4734920561313629,
6894
+ "learning_rate": 3.642111734786833e-05,
6895
+ "loss": 4.337078094482422,
6896
+ "step": 9610
6897
+ },
6898
+ {
6899
+ "epoch": 1.6252745396181787,
6900
+ "grad_norm": 0.48585888743400574,
6901
+ "learning_rate": 3.610881900341261e-05,
6902
+ "loss": 4.291253280639649,
6903
+ "step": 9620
6904
+ },
6905
+ {
6906
+ "epoch": 1.6269640141915864,
6907
+ "grad_norm": 0.4632498323917389,
6908
+ "learning_rate": 3.579768204295063e-05,
6909
+ "loss": 4.331230545043946,
6910
+ "step": 9630
6911
+ },
6912
+ {
6913
+ "epoch": 1.6286534887649942,
6914
+ "grad_norm": 0.46583032608032227,
6915
+ "learning_rate": 3.54877096392434e-05,
6916
+ "loss": 4.336456298828125,
6917
+ "step": 9640
6918
+ },
6919
+ {
6920
+ "epoch": 1.6303429633384017,
6921
+ "grad_norm": 0.4624863564968109,
6922
+ "learning_rate": 3.5178904953176354e-05,
6923
+ "loss": 4.305691146850586,
6924
+ "step": 9650
6925
+ },
6926
+ {
6927
+ "epoch": 1.6320324379118094,
6928
+ "grad_norm": 0.4653433859348297,
6929
+ "learning_rate": 3.487127113372755e-05,
6930
+ "loss": 4.32598648071289,
6931
+ "step": 9660
6932
+ },
6933
+ {
6934
+ "epoch": 1.6337219124852171,
6935
+ "grad_norm": 0.4744962453842163,
6936
+ "learning_rate": 3.4564811317935235e-05,
6937
+ "loss": 4.303342819213867,
6938
+ "step": 9670
6939
+ },
6940
+ {
6941
+ "epoch": 1.6354113870586247,
6942
+ "grad_norm": 0.4726518392562866,
6943
+ "learning_rate": 3.4259528630865995e-05,
6944
+ "loss": 4.328373718261719,
6945
+ "step": 9680
6946
+ },
6947
+ {
6948
+ "epoch": 1.6371008616320324,
6949
+ "grad_norm": 0.4716176390647888,
6950
+ "learning_rate": 3.3955426185582826e-05,
6951
+ "loss": 4.309525680541992,
6952
+ "step": 9690
6953
+ },
6954
+ {
6955
+ "epoch": 1.6387903362054401,
6956
+ "grad_norm": 0.4661267399787903,
6957
+ "learning_rate": 3.365250708311352e-05,
6958
+ "loss": 4.324785232543945,
6959
+ "step": 9700
6960
+ },
6961
+ {
6962
+ "epoch": 1.6404798107788476,
6963
+ "grad_norm": 0.46032196283340454,
6964
+ "learning_rate": 3.335077441241895e-05,
6965
+ "loss": 4.306519317626953,
6966
+ "step": 9710
6967
+ },
6968
+ {
6969
+ "epoch": 1.6421692853522556,
6970
+ "grad_norm": 0.5246592164039612,
6971
+ "learning_rate": 3.305023125036148e-05,
6972
+ "loss": 4.312277221679688,
6973
+ "step": 9720
6974
+ },
6975
+ {
6976
+ "epoch": 1.643858759925663,
6977
+ "grad_norm": 0.46025800704956055,
6978
+ "learning_rate": 3.275088066167369e-05,
6979
+ "loss": 4.307319259643554,
6980
+ "step": 9730
6981
+ },
6982
+ {
6983
+ "epoch": 1.6455482344990708,
6984
+ "grad_norm": 0.47664591670036316,
6985
+ "learning_rate": 3.245272569892727e-05,
6986
+ "loss": 4.350948333740234,
6987
+ "step": 9740
6988
+ },
6989
+ {
6990
+ "epoch": 1.6472377090724786,
6991
+ "grad_norm": 0.46211037039756775,
6992
+ "learning_rate": 3.215576940250155e-05,
6993
+ "loss": 4.310560607910157,
6994
+ "step": 9750
6995
+ },
6996
+ {
6997
+ "epoch": 1.648927183645886,
6998
+ "grad_norm": 0.4830545485019684,
6999
+ "learning_rate": 3.1860014800552734e-05,
7000
+ "loss": 4.30987777709961,
7001
+ "step": 9760
7002
+ },
7003
+ {
7004
+ "epoch": 1.6506166582192938,
7005
+ "grad_norm": 0.4861840605735779,
7006
+ "learning_rate": 3.15654649089831e-05,
7007
+ "loss": 4.3120475769042965,
7008
+ "step": 9770
7009
+ },
7010
+ {
7011
+ "epoch": 1.6523061327927016,
7012
+ "grad_norm": 0.5054605603218079,
7013
+ "learning_rate": 3.1272122731409916e-05,
7014
+ "loss": 4.325033569335938,
7015
+ "step": 9780
7016
+ },
7017
+ {
7018
+ "epoch": 1.653995607366109,
7019
+ "grad_norm": 0.46032124757766724,
7020
+ "learning_rate": 3.097999125913518e-05,
7021
+ "loss": 4.310620880126953,
7022
+ "step": 9790
7023
+ },
7024
+ {
7025
+ "epoch": 1.6556850819395168,
7026
+ "grad_norm": 0.4689234495162964,
7027
+ "learning_rate": 3.068907347111485e-05,
7028
+ "loss": 4.30926513671875,
7029
+ "step": 9800
7030
+ },
7031
+ {
7032
+ "epoch": 1.6573745565129245,
7033
+ "grad_norm": 0.47660669684410095,
7034
+ "learning_rate": 3.0399372333928644e-05,
7035
+ "loss": 4.313259887695312,
7036
+ "step": 9810
7037
+ },
7038
+ {
7039
+ "epoch": 1.659064031086332,
7040
+ "grad_norm": 0.48029860854148865,
7041
+ "learning_rate": 3.0110890801749627e-05,
7042
+ "loss": 4.307758331298828,
7043
+ "step": 9820
7044
+ },
7045
+ {
7046
+ "epoch": 1.6607535056597398,
7047
+ "grad_norm": 0.46481746435165405,
7048
+ "learning_rate": 2.982363181631418e-05,
7049
+ "loss": 4.303005981445312,
7050
+ "step": 9830
7051
+ },
7052
+ {
7053
+ "epoch": 1.6624429802331475,
7054
+ "grad_norm": 0.4820667505264282,
7055
+ "learning_rate": 2.9537598306892103e-05,
7056
+ "loss": 4.307665634155273,
7057
+ "step": 9840
7058
+ },
7059
+ {
7060
+ "epoch": 1.664132454806555,
7061
+ "grad_norm": 0.4749463200569153,
7062
+ "learning_rate": 2.9252793190256447e-05,
7063
+ "loss": 4.284444427490234,
7064
+ "step": 9850
7065
+ },
7066
+ {
7067
+ "epoch": 1.665821929379963,
7068
+ "grad_norm": 0.46186140179634094,
7069
+ "learning_rate": 2.896921937065419e-05,
7070
+ "loss": 4.313379287719727,
7071
+ "step": 9860
7072
+ },
7073
+ {
7074
+ "epoch": 1.6675114039533705,
7075
+ "grad_norm": 0.45953449606895447,
7076
+ "learning_rate": 2.8686879739776137e-05,
7077
+ "loss": 4.316988754272461,
7078
+ "step": 9870
7079
+ },
7080
+ {
7081
+ "epoch": 1.669200878526778,
7082
+ "grad_norm": 0.4738609194755554,
7083
+ "learning_rate": 2.8405777176727924e-05,
7084
+ "loss": 4.317482376098633,
7085
+ "step": 9880
7086
+ },
7087
+ {
7088
+ "epoch": 1.670890353100186,
7089
+ "grad_norm": 0.46274814009666443,
7090
+ "learning_rate": 2.8125914548000243e-05,
7091
+ "loss": 4.294824600219727,
7092
+ "step": 9890
7093
+ },
7094
+ {
7095
+ "epoch": 1.6725798276735935,
7096
+ "grad_norm": 0.47534388303756714,
7097
+ "learning_rate": 2.7847294707439828e-05,
7098
+ "loss": 4.28771743774414,
7099
+ "step": 9900
7100
+ },
7101
+ {
7102
+ "epoch": 1.6742693022470012,
7103
+ "grad_norm": 0.4873548150062561,
7104
+ "learning_rate": 2.7569920496220398e-05,
7105
+ "loss": 4.304574584960937,
7106
+ "step": 9910
7107
+ },
7108
+ {
7109
+ "epoch": 1.675958776820409,
7110
+ "grad_norm": 0.46979424357414246,
7111
+ "learning_rate": 2.729379474281352e-05,
7112
+ "loss": 4.303669738769531,
7113
+ "step": 9920
7114
+ },
7115
+ {
7116
+ "epoch": 1.6776482513938165,
7117
+ "grad_norm": 0.5028051733970642,
7118
+ "learning_rate": 2.701892026295979e-05,
7119
+ "loss": 4.331151962280273,
7120
+ "step": 9930
7121
+ },
7122
+ {
7123
+ "epoch": 1.6793377259672242,
7124
+ "grad_norm": 0.46676331758499146,
7125
+ "learning_rate": 2.6745299859640318e-05,
7126
+ "loss": 4.332028198242187,
7127
+ "step": 9940
7128
+ },
7129
+ {
7130
+ "epoch": 1.681027200540632,
7131
+ "grad_norm": 0.4791058897972107,
7132
+ "learning_rate": 2.6472936323047972e-05,
7133
+ "loss": 4.310791778564453,
7134
+ "step": 9950
7135
+ },
7136
+ {
7137
+ "epoch": 1.6827166751140394,
7138
+ "grad_norm": 0.4680987298488617,
7139
+ "learning_rate": 2.6201832430558866e-05,
7140
+ "loss": 4.313525390625,
7141
+ "step": 9960
7142
+ },
7143
+ {
7144
+ "epoch": 1.6844061496874472,
7145
+ "grad_norm": 0.47215357422828674,
7146
+ "learning_rate": 2.5931990946704206e-05,
7147
+ "loss": 4.311981582641602,
7148
+ "step": 9970
7149
+ },
7150
+ {
7151
+ "epoch": 1.686095624260855,
7152
+ "grad_norm": 0.4663841128349304,
7153
+ "learning_rate": 2.5663414623141943e-05,
7154
+ "loss": 4.314894485473633,
7155
+ "step": 9980
7156
+ },
7157
+ {
7158
+ "epoch": 1.6877850988342624,
7159
+ "grad_norm": 0.4573664367198944,
7160
+ "learning_rate": 2.5396106198628947e-05,
7161
+ "loss": 4.316466903686523,
7162
+ "step": 9990
7163
+ },
7164
+ {
7165
+ "epoch": 1.6894745734076704,
7166
+ "grad_norm": 0.4708999693393707,
7167
+ "learning_rate": 2.5130068398992716e-05,
7168
+ "loss": 4.313570404052735,
7169
+ "step": 10000
7170
+ },
7171
+ {
7172
+ "epoch": 1.6894745734076704,
7173
+ "eval_loss": 4.262009620666504,
7174
+ "eval_runtime": 3.6505,
7175
+ "eval_samples_per_second": 273.932,
7176
+ "eval_steps_per_second": 5.753,
7177
+ "step": 10000
7178
  }
7179
  ],
7180
  "logging_steps": 10,
 
7194
  "attributes": {}
7195
  }
7196
  },
7197
+ "total_flos": 3.344547305037496e+17,
7198
  "train_batch_size": 48,
7199
  "trial_name": null,
7200
  "trial_params": null