kiritan commited on
Commit
417c0ce
·
verified ·
1 Parent(s): 63a7d61

Training in progress, step 20000, checkpoint

Browse files
last-checkpoint/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ade9c3e43b2a1550492ecf4b91e9228af429dcf0d7b1c09aea81ebc7a5842d20
3
- size 761059696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9867ea7f15881e7bed68bb3b7781fc3c4f5646e0a9aec63231d97c009a1c403f
3
+ size 5117197020
last-checkpoint/global_step20000/mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58f46b37e83d56bff8e8b49fc01d48e56f7c2f6034abd01b65de03f862980853
3
- size 129965712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49d8bf456bdf7cfa2a6fed84991a6fc983b6fea67864bf0474df258a8f8c7541
3
+ size 859127504
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step17000
 
1
+ global_step20000
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa4f82ea40fb0305db931cf7a54215d8c646ba708abad07172d476a907b2dad4
3
  size 962205216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:069fbc2b96ff55558de2b6621d0406b4fbcbc7edffe8d2472bb8b992e0abdb14
3
  size 962205216
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be21ff914d7590ad2180b18bca69f62255c4deee5c5c2b727794908b9d148dcc
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef2e75134b208d60f6f9b30cef29e49813797dfcda4ce7d7e2cabca76bb3fa47
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4aa6830d6aa63edbea9a9fa4aac3b79365984a3d18eed4b014dcec7309b75dc2
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29c7a79b53a589de48d3b7a21df9c0d024be4dea79f68869f72fdc01ae3b212a
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 82.65912305516267,
3
- "best_model_checkpoint": "./iteboshi_temp/checkpoint-16000",
4
- "epoch": 18.722466960352424,
5
  "eval_steps": 1000,
6
- "global_step": 17000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4937,6 +4937,876 @@
4937
  "eval_steps_per_second": 1.554,
4938
  "eval_wer": 82.998585572843,
4939
  "step": 17000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4940
  }
4941
  ],
4942
  "logging_steps": 25,
@@ -4951,12 +5821,12 @@
4951
  "should_evaluate": false,
4952
  "should_log": false,
4953
  "should_save": true,
4954
- "should_training_stop": false
4955
  },
4956
  "attributes": {}
4957
  }
4958
  },
4959
- "total_flos": 2.920843417033166e+20,
4960
  "train_batch_size": 4,
4961
  "trial_name": null,
4962
  "trial_params": null
 
1
  {
2
+ "best_metric": 82.34794908062236,
3
+ "best_model_checkpoint": "./iteboshi_temp/checkpoint-20000",
4
+ "epoch": 22.026431718061673,
5
  "eval_steps": 1000,
6
+ "global_step": 20000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4937
  "eval_steps_per_second": 1.554,
4938
  "eval_wer": 82.998585572843,
4939
  "step": 17000
4940
+ },
4941
+ {
4942
+ "epoch": 18.75,
4943
+ "grad_norm": 0.04777693375945091,
4944
+ "learning_rate": 3.051282051282052e-06,
4945
+ "loss": 0.0023,
4946
+ "step": 17025
4947
+ },
4948
+ {
4949
+ "epoch": 18.777533039647576,
4950
+ "grad_norm": 0.012851621024310589,
4951
+ "learning_rate": 3.0256410256410256e-06,
4952
+ "loss": 0.0016,
4953
+ "step": 17050
4954
+ },
4955
+ {
4956
+ "epoch": 18.805066079295155,
4957
+ "grad_norm": 0.07990699261426926,
4958
+ "learning_rate": 3e-06,
4959
+ "loss": 0.0016,
4960
+ "step": 17075
4961
+ },
4962
+ {
4963
+ "epoch": 18.83259911894273,
4964
+ "grad_norm": 0.011805381625890732,
4965
+ "learning_rate": 2.9743589743589746e-06,
4966
+ "loss": 0.0027,
4967
+ "step": 17100
4968
+ },
4969
+ {
4970
+ "epoch": 18.860132158590307,
4971
+ "grad_norm": 0.14670372009277344,
4972
+ "learning_rate": 2.948717948717949e-06,
4973
+ "loss": 0.0026,
4974
+ "step": 17125
4975
+ },
4976
+ {
4977
+ "epoch": 18.887665198237887,
4978
+ "grad_norm": 0.023519041016697884,
4979
+ "learning_rate": 2.9230769230769236e-06,
4980
+ "loss": 0.0028,
4981
+ "step": 17150
4982
+ },
4983
+ {
4984
+ "epoch": 18.915198237885463,
4985
+ "grad_norm": 0.021847659721970558,
4986
+ "learning_rate": 2.897435897435898e-06,
4987
+ "loss": 0.0015,
4988
+ "step": 17175
4989
+ },
4990
+ {
4991
+ "epoch": 18.94273127753304,
4992
+ "grad_norm": 0.013796437531709671,
4993
+ "learning_rate": 2.8717948717948717e-06,
4994
+ "loss": 0.0023,
4995
+ "step": 17200
4996
+ },
4997
+ {
4998
+ "epoch": 18.970264317180618,
4999
+ "grad_norm": 0.1518554836511612,
5000
+ "learning_rate": 2.846153846153846e-06,
5001
+ "loss": 0.0016,
5002
+ "step": 17225
5003
+ },
5004
+ {
5005
+ "epoch": 18.997797356828194,
5006
+ "grad_norm": 0.012883415445685387,
5007
+ "learning_rate": 2.8205128205128207e-06,
5008
+ "loss": 0.0019,
5009
+ "step": 17250
5010
+ },
5011
+ {
5012
+ "epoch": 19.02533039647577,
5013
+ "grad_norm": 0.01099941972643137,
5014
+ "learning_rate": 2.794871794871795e-06,
5015
+ "loss": 0.0022,
5016
+ "step": 17275
5017
+ },
5018
+ {
5019
+ "epoch": 19.05286343612335,
5020
+ "grad_norm": 0.006992665119469166,
5021
+ "learning_rate": 2.7692307692307697e-06,
5022
+ "loss": 0.0011,
5023
+ "step": 17300
5024
+ },
5025
+ {
5026
+ "epoch": 19.080396475770925,
5027
+ "grad_norm": 0.012264972552657127,
5028
+ "learning_rate": 2.743589743589744e-06,
5029
+ "loss": 0.0014,
5030
+ "step": 17325
5031
+ },
5032
+ {
5033
+ "epoch": 19.1079295154185,
5034
+ "grad_norm": 0.04312492161989212,
5035
+ "learning_rate": 2.717948717948718e-06,
5036
+ "loss": 0.0012,
5037
+ "step": 17350
5038
+ },
5039
+ {
5040
+ "epoch": 19.13546255506608,
5041
+ "grad_norm": 0.008214226923882961,
5042
+ "learning_rate": 2.6923076923076923e-06,
5043
+ "loss": 0.0011,
5044
+ "step": 17375
5045
+ },
5046
+ {
5047
+ "epoch": 19.162995594713657,
5048
+ "grad_norm": 0.009182457812130451,
5049
+ "learning_rate": 2.666666666666667e-06,
5050
+ "loss": 0.0011,
5051
+ "step": 17400
5052
+ },
5053
+ {
5054
+ "epoch": 19.190528634361232,
5055
+ "grad_norm": 0.009743117727339268,
5056
+ "learning_rate": 2.6410256410256413e-06,
5057
+ "loss": 0.001,
5058
+ "step": 17425
5059
+ },
5060
+ {
5061
+ "epoch": 19.218061674008812,
5062
+ "grad_norm": 0.011959163472056389,
5063
+ "learning_rate": 2.615384615384616e-06,
5064
+ "loss": 0.001,
5065
+ "step": 17450
5066
+ },
5067
+ {
5068
+ "epoch": 19.245594713656388,
5069
+ "grad_norm": 0.033681828528642654,
5070
+ "learning_rate": 2.5897435897435903e-06,
5071
+ "loss": 0.0019,
5072
+ "step": 17475
5073
+ },
5074
+ {
5075
+ "epoch": 19.273127753303964,
5076
+ "grad_norm": 0.012354315258562565,
5077
+ "learning_rate": 2.564102564102564e-06,
5078
+ "loss": 0.0028,
5079
+ "step": 17500
5080
+ },
5081
+ {
5082
+ "epoch": 19.300660792951543,
5083
+ "grad_norm": 0.01059970073401928,
5084
+ "learning_rate": 2.5384615384615385e-06,
5085
+ "loss": 0.0018,
5086
+ "step": 17525
5087
+ },
5088
+ {
5089
+ "epoch": 19.32819383259912,
5090
+ "grad_norm": 0.007629127707332373,
5091
+ "learning_rate": 2.512820512820513e-06,
5092
+ "loss": 0.001,
5093
+ "step": 17550
5094
+ },
5095
+ {
5096
+ "epoch": 19.355726872246695,
5097
+ "grad_norm": 0.0125362453982234,
5098
+ "learning_rate": 2.4871794871794875e-06,
5099
+ "loss": 0.001,
5100
+ "step": 17575
5101
+ },
5102
+ {
5103
+ "epoch": 19.383259911894275,
5104
+ "grad_norm": 0.01261002104729414,
5105
+ "learning_rate": 2.461538461538462e-06,
5106
+ "loss": 0.0014,
5107
+ "step": 17600
5108
+ },
5109
+ {
5110
+ "epoch": 19.41079295154185,
5111
+ "grad_norm": 0.010447504930198193,
5112
+ "learning_rate": 2.435897435897436e-06,
5113
+ "loss": 0.0021,
5114
+ "step": 17625
5115
+ },
5116
+ {
5117
+ "epoch": 19.438325991189426,
5118
+ "grad_norm": 0.009724145755171776,
5119
+ "learning_rate": 2.4102564102564105e-06,
5120
+ "loss": 0.0021,
5121
+ "step": 17650
5122
+ },
5123
+ {
5124
+ "epoch": 19.465859030837006,
5125
+ "grad_norm": 0.008591737598180771,
5126
+ "learning_rate": 2.384615384615385e-06,
5127
+ "loss": 0.0013,
5128
+ "step": 17675
5129
+ },
5130
+ {
5131
+ "epoch": 19.493392070484582,
5132
+ "grad_norm": 0.008385499939322472,
5133
+ "learning_rate": 2.358974358974359e-06,
5134
+ "loss": 0.0017,
5135
+ "step": 17700
5136
+ },
5137
+ {
5138
+ "epoch": 19.520925110132158,
5139
+ "grad_norm": 0.04597390815615654,
5140
+ "learning_rate": 2.3333333333333336e-06,
5141
+ "loss": 0.0013,
5142
+ "step": 17725
5143
+ },
5144
+ {
5145
+ "epoch": 19.548458149779737,
5146
+ "grad_norm": 0.00930617842823267,
5147
+ "learning_rate": 2.307692307692308e-06,
5148
+ "loss": 0.0016,
5149
+ "step": 17750
5150
+ },
5151
+ {
5152
+ "epoch": 19.575991189427313,
5153
+ "grad_norm": 0.009862055070698261,
5154
+ "learning_rate": 2.282051282051282e-06,
5155
+ "loss": 0.0014,
5156
+ "step": 17775
5157
+ },
5158
+ {
5159
+ "epoch": 19.60352422907489,
5160
+ "grad_norm": 0.01388918049633503,
5161
+ "learning_rate": 2.2564102564102566e-06,
5162
+ "loss": 0.0011,
5163
+ "step": 17800
5164
+ },
5165
+ {
5166
+ "epoch": 19.63105726872247,
5167
+ "grad_norm": 0.010380508378148079,
5168
+ "learning_rate": 2.230769230769231e-06,
5169
+ "loss": 0.0022,
5170
+ "step": 17825
5171
+ },
5172
+ {
5173
+ "epoch": 19.658590308370044,
5174
+ "grad_norm": 0.003493061987683177,
5175
+ "learning_rate": 2.2051282051282052e-06,
5176
+ "loss": 0.001,
5177
+ "step": 17850
5178
+ },
5179
+ {
5180
+ "epoch": 19.68612334801762,
5181
+ "grad_norm": 0.00607143621891737,
5182
+ "learning_rate": 2.1794871794871797e-06,
5183
+ "loss": 0.0016,
5184
+ "step": 17875
5185
+ },
5186
+ {
5187
+ "epoch": 19.7136563876652,
5188
+ "grad_norm": 0.007698683068156242,
5189
+ "learning_rate": 2.153846153846154e-06,
5190
+ "loss": 0.0029,
5191
+ "step": 17900
5192
+ },
5193
+ {
5194
+ "epoch": 19.741189427312776,
5195
+ "grad_norm": 0.007107453886419535,
5196
+ "learning_rate": 2.1282051282051283e-06,
5197
+ "loss": 0.0018,
5198
+ "step": 17925
5199
+ },
5200
+ {
5201
+ "epoch": 19.76872246696035,
5202
+ "grad_norm": 0.0059033227153122425,
5203
+ "learning_rate": 2.1025641025641028e-06,
5204
+ "loss": 0.001,
5205
+ "step": 17950
5206
+ },
5207
+ {
5208
+ "epoch": 19.79625550660793,
5209
+ "grad_norm": 0.005275961942970753,
5210
+ "learning_rate": 2.0769230769230773e-06,
5211
+ "loss": 0.0026,
5212
+ "step": 17975
5213
+ },
5214
+ {
5215
+ "epoch": 19.823788546255507,
5216
+ "grad_norm": 0.016638007014989853,
5217
+ "learning_rate": 2.0512820512820513e-06,
5218
+ "loss": 0.0019,
5219
+ "step": 18000
5220
+ },
5221
+ {
5222
+ "epoch": 19.823788546255507,
5223
+ "eval_cer": 22.66344158747263,
5224
+ "eval_loss": 0.8900153040885925,
5225
+ "eval_runtime": 1717.0751,
5226
+ "eval_samples_per_second": 6.162,
5227
+ "eval_steps_per_second": 1.541,
5228
+ "eval_wer": 82.50825082508251,
5229
+ "step": 18000
5230
+ },
5231
+ {
5232
+ "epoch": 19.851321585903083,
5233
+ "grad_norm": 0.0051730177365243435,
5234
+ "learning_rate": 2.025641025641026e-06,
5235
+ "loss": 0.0013,
5236
+ "step": 18025
5237
+ },
5238
+ {
5239
+ "epoch": 19.878854625550662,
5240
+ "grad_norm": 0.00516405189409852,
5241
+ "learning_rate": 2.0000000000000003e-06,
5242
+ "loss": 0.0018,
5243
+ "step": 18050
5244
+ },
5245
+ {
5246
+ "epoch": 19.90638766519824,
5247
+ "grad_norm": 0.006816135719418526,
5248
+ "learning_rate": 1.9743589743589744e-06,
5249
+ "loss": 0.001,
5250
+ "step": 18075
5251
+ },
5252
+ {
5253
+ "epoch": 19.933920704845814,
5254
+ "grad_norm": 0.005780714098364115,
5255
+ "learning_rate": 1.948717948717949e-06,
5256
+ "loss": 0.0009,
5257
+ "step": 18100
5258
+ },
5259
+ {
5260
+ "epoch": 19.961453744493394,
5261
+ "grad_norm": 0.007895824499428272,
5262
+ "learning_rate": 1.9230769230769234e-06,
5263
+ "loss": 0.0011,
5264
+ "step": 18125
5265
+ },
5266
+ {
5267
+ "epoch": 19.98898678414097,
5268
+ "grad_norm": 0.00839215237647295,
5269
+ "learning_rate": 1.8974358974358975e-06,
5270
+ "loss": 0.0011,
5271
+ "step": 18150
5272
+ },
5273
+ {
5274
+ "epoch": 20.016519823788546,
5275
+ "grad_norm": 0.0035141175612807274,
5276
+ "learning_rate": 1.871794871794872e-06,
5277
+ "loss": 0.0011,
5278
+ "step": 18175
5279
+ },
5280
+ {
5281
+ "epoch": 20.044052863436125,
5282
+ "grad_norm": 0.008937545120716095,
5283
+ "learning_rate": 1.8461538461538465e-06,
5284
+ "loss": 0.0009,
5285
+ "step": 18200
5286
+ },
5287
+ {
5288
+ "epoch": 20.0715859030837,
5289
+ "grad_norm": 0.0037842292804270983,
5290
+ "learning_rate": 1.8205128205128205e-06,
5291
+ "loss": 0.0011,
5292
+ "step": 18225
5293
+ },
5294
+ {
5295
+ "epoch": 20.099118942731277,
5296
+ "grad_norm": 0.003870155429467559,
5297
+ "learning_rate": 1.794871794871795e-06,
5298
+ "loss": 0.0009,
5299
+ "step": 18250
5300
+ },
5301
+ {
5302
+ "epoch": 20.126651982378856,
5303
+ "grad_norm": 0.003817240707576275,
5304
+ "learning_rate": 1.7692307692307695e-06,
5305
+ "loss": 0.0009,
5306
+ "step": 18275
5307
+ },
5308
+ {
5309
+ "epoch": 20.154185022026432,
5310
+ "grad_norm": 0.007133571431040764,
5311
+ "learning_rate": 1.7435897435897436e-06,
5312
+ "loss": 0.0008,
5313
+ "step": 18300
5314
+ },
5315
+ {
5316
+ "epoch": 20.181718061674008,
5317
+ "grad_norm": 0.011461510322988033,
5318
+ "learning_rate": 1.717948717948718e-06,
5319
+ "loss": 0.0007,
5320
+ "step": 18325
5321
+ },
5322
+ {
5323
+ "epoch": 20.209251101321588,
5324
+ "grad_norm": 0.003969813231378794,
5325
+ "learning_rate": 1.6923076923076926e-06,
5326
+ "loss": 0.001,
5327
+ "step": 18350
5328
+ },
5329
+ {
5330
+ "epoch": 20.236784140969164,
5331
+ "grad_norm": 0.007272036280483007,
5332
+ "learning_rate": 1.6666666666666667e-06,
5333
+ "loss": 0.001,
5334
+ "step": 18375
5335
+ },
5336
+ {
5337
+ "epoch": 20.26431718061674,
5338
+ "grad_norm": 0.006936676800251007,
5339
+ "learning_rate": 1.6410256410256412e-06,
5340
+ "loss": 0.0009,
5341
+ "step": 18400
5342
+ },
5343
+ {
5344
+ "epoch": 20.291850220264315,
5345
+ "grad_norm": 0.005403169430792332,
5346
+ "learning_rate": 1.6153846153846157e-06,
5347
+ "loss": 0.0007,
5348
+ "step": 18425
5349
+ },
5350
+ {
5351
+ "epoch": 20.319383259911895,
5352
+ "grad_norm": 0.009516764432191849,
5353
+ "learning_rate": 1.5897435897435897e-06,
5354
+ "loss": 0.0029,
5355
+ "step": 18450
5356
+ },
5357
+ {
5358
+ "epoch": 20.34691629955947,
5359
+ "grad_norm": 0.003727905685082078,
5360
+ "learning_rate": 1.5641025641025642e-06,
5361
+ "loss": 0.0008,
5362
+ "step": 18475
5363
+ },
5364
+ {
5365
+ "epoch": 20.374449339207047,
5366
+ "grad_norm": 0.006022660061717033,
5367
+ "learning_rate": 1.5384615384615387e-06,
5368
+ "loss": 0.002,
5369
+ "step": 18500
5370
+ },
5371
+ {
5372
+ "epoch": 20.401982378854626,
5373
+ "grad_norm": 0.004205208737403154,
5374
+ "learning_rate": 1.5128205128205128e-06,
5375
+ "loss": 0.001,
5376
+ "step": 18525
5377
+ },
5378
+ {
5379
+ "epoch": 20.429515418502202,
5380
+ "grad_norm": 0.10070935636758804,
5381
+ "learning_rate": 1.4871794871794873e-06,
5382
+ "loss": 0.0009,
5383
+ "step": 18550
5384
+ },
5385
+ {
5386
+ "epoch": 20.457048458149778,
5387
+ "grad_norm": 0.004871605895459652,
5388
+ "learning_rate": 1.4615384615384618e-06,
5389
+ "loss": 0.0009,
5390
+ "step": 18575
5391
+ },
5392
+ {
5393
+ "epoch": 20.484581497797357,
5394
+ "grad_norm": 0.005528348032385111,
5395
+ "learning_rate": 1.4358974358974359e-06,
5396
+ "loss": 0.0008,
5397
+ "step": 18600
5398
+ },
5399
+ {
5400
+ "epoch": 20.512114537444933,
5401
+ "grad_norm": 0.007922505959868431,
5402
+ "learning_rate": 1.4102564102564104e-06,
5403
+ "loss": 0.0007,
5404
+ "step": 18625
5405
+ },
5406
+ {
5407
+ "epoch": 20.53964757709251,
5408
+ "grad_norm": 0.004503941163420677,
5409
+ "learning_rate": 1.3846153846153848e-06,
5410
+ "loss": 0.001,
5411
+ "step": 18650
5412
+ },
5413
+ {
5414
+ "epoch": 20.56718061674009,
5415
+ "grad_norm": 0.04012945666909218,
5416
+ "learning_rate": 1.358974358974359e-06,
5417
+ "loss": 0.0011,
5418
+ "step": 18675
5419
+ },
5420
+ {
5421
+ "epoch": 20.594713656387665,
5422
+ "grad_norm": 0.011533623561263084,
5423
+ "learning_rate": 1.3333333333333334e-06,
5424
+ "loss": 0.0011,
5425
+ "step": 18700
5426
+ },
5427
+ {
5428
+ "epoch": 20.62224669603524,
5429
+ "grad_norm": 0.008248466067016125,
5430
+ "learning_rate": 1.307692307692308e-06,
5431
+ "loss": 0.0009,
5432
+ "step": 18725
5433
+ },
5434
+ {
5435
+ "epoch": 20.64977973568282,
5436
+ "grad_norm": 0.004799861926585436,
5437
+ "learning_rate": 1.282051282051282e-06,
5438
+ "loss": 0.0007,
5439
+ "step": 18750
5440
+ },
5441
+ {
5442
+ "epoch": 20.677312775330396,
5443
+ "grad_norm": 0.006359547842293978,
5444
+ "learning_rate": 1.2564102564102565e-06,
5445
+ "loss": 0.0007,
5446
+ "step": 18775
5447
+ },
5448
+ {
5449
+ "epoch": 20.704845814977972,
5450
+ "grad_norm": 0.006216075737029314,
5451
+ "learning_rate": 1.230769230769231e-06,
5452
+ "loss": 0.001,
5453
+ "step": 18800
5454
+ },
5455
+ {
5456
+ "epoch": 20.73237885462555,
5457
+ "grad_norm": 0.08518233150243759,
5458
+ "learning_rate": 1.2051282051282053e-06,
5459
+ "loss": 0.0012,
5460
+ "step": 18825
5461
+ },
5462
+ {
5463
+ "epoch": 20.759911894273127,
5464
+ "grad_norm": 0.004133372101932764,
5465
+ "learning_rate": 1.1794871794871795e-06,
5466
+ "loss": 0.001,
5467
+ "step": 18850
5468
+ },
5469
+ {
5470
+ "epoch": 20.787444933920703,
5471
+ "grad_norm": 0.006971430499106646,
5472
+ "learning_rate": 1.153846153846154e-06,
5473
+ "loss": 0.0014,
5474
+ "step": 18875
5475
+ },
5476
+ {
5477
+ "epoch": 20.814977973568283,
5478
+ "grad_norm": 0.005109596531838179,
5479
+ "learning_rate": 1.1282051282051283e-06,
5480
+ "loss": 0.0011,
5481
+ "step": 18900
5482
+ },
5483
+ {
5484
+ "epoch": 20.84251101321586,
5485
+ "grad_norm": 0.038249921053647995,
5486
+ "learning_rate": 1.1025641025641026e-06,
5487
+ "loss": 0.0012,
5488
+ "step": 18925
5489
+ },
5490
+ {
5491
+ "epoch": 20.870044052863435,
5492
+ "grad_norm": 0.008875112980604172,
5493
+ "learning_rate": 1.076923076923077e-06,
5494
+ "loss": 0.0007,
5495
+ "step": 18950
5496
+ },
5497
+ {
5498
+ "epoch": 20.897577092511014,
5499
+ "grad_norm": 0.0044938609935343266,
5500
+ "learning_rate": 1.0512820512820514e-06,
5501
+ "loss": 0.0011,
5502
+ "step": 18975
5503
+ },
5504
+ {
5505
+ "epoch": 20.92511013215859,
5506
+ "grad_norm": 0.07247400283813477,
5507
+ "learning_rate": 1.0256410256410257e-06,
5508
+ "loss": 0.0008,
5509
+ "step": 19000
5510
+ },
5511
+ {
5512
+ "epoch": 20.92511013215859,
5513
+ "eval_cer": 22.58778214666468,
5514
+ "eval_loss": 0.892371654510498,
5515
+ "eval_runtime": 1719.93,
5516
+ "eval_samples_per_second": 6.152,
5517
+ "eval_steps_per_second": 1.538,
5518
+ "eval_wer": 82.47996228194248,
5519
+ "step": 19000
5520
+ },
5521
+ {
5522
+ "epoch": 20.952643171806166,
5523
+ "grad_norm": 0.006040550768375397,
5524
+ "learning_rate": 1.0000000000000002e-06,
5525
+ "loss": 0.0007,
5526
+ "step": 19025
5527
+ },
5528
+ {
5529
+ "epoch": 20.980176211453745,
5530
+ "grad_norm": 0.00338306394405663,
5531
+ "learning_rate": 9.743589743589745e-07,
5532
+ "loss": 0.001,
5533
+ "step": 19050
5534
+ },
5535
+ {
5536
+ "epoch": 21.00770925110132,
5537
+ "grad_norm": 0.007667516358196735,
5538
+ "learning_rate": 9.487179487179487e-07,
5539
+ "loss": 0.0012,
5540
+ "step": 19075
5541
+ },
5542
+ {
5543
+ "epoch": 21.035242290748897,
5544
+ "grad_norm": 0.0036987056955695152,
5545
+ "learning_rate": 9.230769230769232e-07,
5546
+ "loss": 0.0006,
5547
+ "step": 19100
5548
+ },
5549
+ {
5550
+ "epoch": 21.062775330396477,
5551
+ "grad_norm": 0.0036683231592178345,
5552
+ "learning_rate": 8.974358974358975e-07,
5553
+ "loss": 0.0011,
5554
+ "step": 19125
5555
+ },
5556
+ {
5557
+ "epoch": 21.090308370044053,
5558
+ "grad_norm": 0.007168483920395374,
5559
+ "learning_rate": 8.717948717948718e-07,
5560
+ "loss": 0.0009,
5561
+ "step": 19150
5562
+ },
5563
+ {
5564
+ "epoch": 21.11784140969163,
5565
+ "grad_norm": 0.0029900213703513145,
5566
+ "learning_rate": 8.461538461538463e-07,
5567
+ "loss": 0.0017,
5568
+ "step": 19175
5569
+ },
5570
+ {
5571
+ "epoch": 21.145374449339208,
5572
+ "grad_norm": 0.00418079923838377,
5573
+ "learning_rate": 8.205128205128206e-07,
5574
+ "loss": 0.0009,
5575
+ "step": 19200
5576
+ },
5577
+ {
5578
+ "epoch": 21.172907488986784,
5579
+ "grad_norm": 0.003424633527174592,
5580
+ "learning_rate": 7.948717948717949e-07,
5581
+ "loss": 0.0007,
5582
+ "step": 19225
5583
+ },
5584
+ {
5585
+ "epoch": 21.20044052863436,
5586
+ "grad_norm": 0.0028422100003808737,
5587
+ "learning_rate": 7.692307692307694e-07,
5588
+ "loss": 0.0006,
5589
+ "step": 19250
5590
+ },
5591
+ {
5592
+ "epoch": 21.22797356828194,
5593
+ "grad_norm": 0.004691548179835081,
5594
+ "learning_rate": 7.435897435897436e-07,
5595
+ "loss": 0.0006,
5596
+ "step": 19275
5597
+ },
5598
+ {
5599
+ "epoch": 21.255506607929515,
5600
+ "grad_norm": 0.004589064046740532,
5601
+ "learning_rate": 7.179487179487179e-07,
5602
+ "loss": 0.0005,
5603
+ "step": 19300
5604
+ },
5605
+ {
5606
+ "epoch": 21.28303964757709,
5607
+ "grad_norm": 0.005557245574891567,
5608
+ "learning_rate": 6.923076923076924e-07,
5609
+ "loss": 0.0011,
5610
+ "step": 19325
5611
+ },
5612
+ {
5613
+ "epoch": 21.31057268722467,
5614
+ "grad_norm": 0.0031431138049811125,
5615
+ "learning_rate": 6.666666666666667e-07,
5616
+ "loss": 0.0006,
5617
+ "step": 19350
5618
+ },
5619
+ {
5620
+ "epoch": 21.338105726872246,
5621
+ "grad_norm": 0.004688850603997707,
5622
+ "learning_rate": 6.41025641025641e-07,
5623
+ "loss": 0.0007,
5624
+ "step": 19375
5625
+ },
5626
+ {
5627
+ "epoch": 21.365638766519822,
5628
+ "grad_norm": 0.007398667279630899,
5629
+ "learning_rate": 6.153846153846155e-07,
5630
+ "loss": 0.0006,
5631
+ "step": 19400
5632
+ },
5633
+ {
5634
+ "epoch": 21.393171806167402,
5635
+ "grad_norm": 0.005217025522142649,
5636
+ "learning_rate": 5.897435897435898e-07,
5637
+ "loss": 0.0008,
5638
+ "step": 19425
5639
+ },
5640
+ {
5641
+ "epoch": 21.420704845814978,
5642
+ "grad_norm": 0.004331599920988083,
5643
+ "learning_rate": 5.641025641025642e-07,
5644
+ "loss": 0.0006,
5645
+ "step": 19450
5646
+ },
5647
+ {
5648
+ "epoch": 21.448237885462554,
5649
+ "grad_norm": 0.004927519708871841,
5650
+ "learning_rate": 5.384615384615386e-07,
5651
+ "loss": 0.0009,
5652
+ "step": 19475
5653
+ },
5654
+ {
5655
+ "epoch": 21.475770925110133,
5656
+ "grad_norm": 0.0034796635154634714,
5657
+ "learning_rate": 5.128205128205128e-07,
5658
+ "loss": 0.001,
5659
+ "step": 19500
5660
+ },
5661
+ {
5662
+ "epoch": 21.50330396475771,
5663
+ "grad_norm": 0.00347193144261837,
5664
+ "learning_rate": 4.871794871794872e-07,
5665
+ "loss": 0.0006,
5666
+ "step": 19525
5667
+ },
5668
+ {
5669
+ "epoch": 21.530837004405285,
5670
+ "grad_norm": 0.0074023474007844925,
5671
+ "learning_rate": 4.615384615384616e-07,
5672
+ "loss": 0.0006,
5673
+ "step": 19550
5674
+ },
5675
+ {
5676
+ "epoch": 21.558370044052865,
5677
+ "grad_norm": 0.0036716184113174677,
5678
+ "learning_rate": 4.358974358974359e-07,
5679
+ "loss": 0.0006,
5680
+ "step": 19575
5681
+ },
5682
+ {
5683
+ "epoch": 21.58590308370044,
5684
+ "grad_norm": 0.006558453664183617,
5685
+ "learning_rate": 4.102564102564103e-07,
5686
+ "loss": 0.0007,
5687
+ "step": 19600
5688
+ },
5689
+ {
5690
+ "epoch": 21.613436123348016,
5691
+ "grad_norm": 0.0030144904740154743,
5692
+ "learning_rate": 3.846153846153847e-07,
5693
+ "loss": 0.0007,
5694
+ "step": 19625
5695
+ },
5696
+ {
5697
+ "epoch": 21.640969162995596,
5698
+ "grad_norm": 0.0037687935400754213,
5699
+ "learning_rate": 3.5897435897435896e-07,
5700
+ "loss": 0.0007,
5701
+ "step": 19650
5702
+ },
5703
+ {
5704
+ "epoch": 21.66850220264317,
5705
+ "grad_norm": 0.0722261294722557,
5706
+ "learning_rate": 3.3333333333333335e-07,
5707
+ "loss": 0.0007,
5708
+ "step": 19675
5709
+ },
5710
+ {
5711
+ "epoch": 21.696035242290748,
5712
+ "grad_norm": 0.0034861781168729067,
5713
+ "learning_rate": 3.0769230769230774e-07,
5714
+ "loss": 0.0007,
5715
+ "step": 19700
5716
+ },
5717
+ {
5718
+ "epoch": 21.723568281938327,
5719
+ "grad_norm": 0.004740406293421984,
5720
+ "learning_rate": 2.820512820512821e-07,
5721
+ "loss": 0.0009,
5722
+ "step": 19725
5723
+ },
5724
+ {
5725
+ "epoch": 21.751101321585903,
5726
+ "grad_norm": 0.0040426794439554214,
5727
+ "learning_rate": 2.564102564102564e-07,
5728
+ "loss": 0.0007,
5729
+ "step": 19750
5730
+ },
5731
+ {
5732
+ "epoch": 21.77863436123348,
5733
+ "grad_norm": 0.005103557836264372,
5734
+ "learning_rate": 2.307692307692308e-07,
5735
+ "loss": 0.0006,
5736
+ "step": 19775
5737
+ },
5738
+ {
5739
+ "epoch": 21.80616740088106,
5740
+ "grad_norm": 0.007594733498990536,
5741
+ "learning_rate": 2.0512820512820514e-07,
5742
+ "loss": 0.0006,
5743
+ "step": 19800
5744
+ },
5745
+ {
5746
+ "epoch": 21.833700440528634,
5747
+ "grad_norm": 0.004270041361451149,
5748
+ "learning_rate": 1.7948717948717948e-07,
5749
+ "loss": 0.0007,
5750
+ "step": 19825
5751
+ },
5752
+ {
5753
+ "epoch": 21.86123348017621,
5754
+ "grad_norm": 0.00658000260591507,
5755
+ "learning_rate": 1.5384615384615387e-07,
5756
+ "loss": 0.0006,
5757
+ "step": 19850
5758
+ },
5759
+ {
5760
+ "epoch": 21.88876651982379,
5761
+ "grad_norm": 0.004829788114875555,
5762
+ "learning_rate": 1.282051282051282e-07,
5763
+ "loss": 0.0005,
5764
+ "step": 19875
5765
+ },
5766
+ {
5767
+ "epoch": 21.916299559471366,
5768
+ "grad_norm": 0.004017261788249016,
5769
+ "learning_rate": 1.0256410256410257e-07,
5770
+ "loss": 0.0006,
5771
+ "step": 19900
5772
+ },
5773
+ {
5774
+ "epoch": 21.94383259911894,
5775
+ "grad_norm": 0.005543394014239311,
5776
+ "learning_rate": 7.692307692307694e-08,
5777
+ "loss": 0.0009,
5778
+ "step": 19925
5779
+ },
5780
+ {
5781
+ "epoch": 21.97136563876652,
5782
+ "grad_norm": 0.006894242484122515,
5783
+ "learning_rate": 5.1282051282051286e-08,
5784
+ "loss": 0.0006,
5785
+ "step": 19950
5786
+ },
5787
+ {
5788
+ "epoch": 21.998898678414097,
5789
+ "grad_norm": 0.004000292159616947,
5790
+ "learning_rate": 2.5641025641025643e-08,
5791
+ "loss": 0.0007,
5792
+ "step": 19975
5793
+ },
5794
+ {
5795
+ "epoch": 22.026431718061673,
5796
+ "grad_norm": 0.004270936828106642,
5797
+ "learning_rate": 0.0,
5798
+ "loss": 0.0006,
5799
+ "step": 20000
5800
+ },
5801
+ {
5802
+ "epoch": 22.026431718061673,
5803
+ "eval_cer": 22.62675822223241,
5804
+ "eval_loss": 0.8947405219078064,
5805
+ "eval_runtime": 1706.5603,
5806
+ "eval_samples_per_second": 6.2,
5807
+ "eval_steps_per_second": 1.55,
5808
+ "eval_wer": 82.34794908062236,
5809
+ "step": 20000
5810
  }
5811
  ],
5812
  "logging_steps": 25,
 
5821
  "should_evaluate": false,
5822
  "should_log": false,
5823
  "should_save": true,
5824
+ "should_training_stop": true
5825
  },
5826
  "attributes": {}
5827
  }
5828
  },
5829
+ "total_flos": 3.4362863729801953e+20,
5830
  "train_batch_size": 4,
5831
  "trial_name": null,
5832
  "trial_params": null