ctaguchi committed on
Commit
331124b
·
verified ·
1 Parent(s): 85e639b

Model save

Browse files
Files changed (2) hide show
  1. README.md +74 -38
  2. trainer_state.json +873 -297
README.md CHANGED
@@ -16,8 +16,8 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  This model is a fine-tuned version of [facebook/wav2vec2-xls-r-300m](https://huggingface.co/facebook/wav2vec2-xls-r-300m) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 1.4788
20
- - Cer: 0.4792
21
 
22
  ## Model description
23
 
@@ -45,48 +45,84 @@ The following hyperparameters were used during training:
45
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
46
  - lr_scheduler_type: linear
47
  - lr_scheduler_warmup_steps: 100
48
- - num_epochs: 50
49
  - mixed_precision_training: Native AMP
50
 
51
  ### Training results
52
 
53
  | Training Loss | Epoch | Step | Validation Loss | Cer |
54
  |:-------------:|:-------:|:----:|:---------------:|:------:|
55
- | 11.3907 | 1.3916 | 100 | 3.8605 | 0.9907 |
56
- | 3.9297 | 2.7832 | 200 | 3.8235 | 0.9905 |
57
- | 4.2491 | 4.1678 | 300 | 3.8157 | 0.9907 |
58
- | 3.8734 | 5.5594 | 400 | 3.7852 | 0.9903 |
59
- | 3.828 | 6.9510 | 500 | 3.8629 | 0.9901 |
60
- | 3.7617 | 8.3357 | 600 | 3.7294 | 0.9903 |
61
- | 3.7543 | 9.7273 | 700 | 3.6791 | 0.9907 |
62
- | 3.6727 | 11.1119 | 800 | 3.6882 | 0.9901 |
63
- | 3.6469 | 12.5035 | 900 | 3.6921 | 0.9907 |
64
- | 3.6337 | 13.8951 | 1000 | 3.6389 | 0.9897 |
65
- | 3.5753 | 15.2797 | 1100 | 3.5900 | 0.9870 |
66
- | 3.5827 | 16.6713 | 1200 | 3.5274 | 0.9697 |
67
- | 3.4855 | 18.0559 | 1300 | 3.4092 | 0.9661 |
68
- | 3.4311 | 19.4476 | 1400 | 3.3445 | 0.9744 |
69
- | 3.3671 | 20.8392 | 1500 | 3.2508 | 0.9647 |
70
- | 3.2572 | 22.2238 | 1600 | 3.1160 | 0.9697 |
71
- | 3.1242 | 23.6154 | 1700 | 2.8400 | 0.9538 |
72
- | 2.9165 | 25.0 | 1800 | 2.5780 | 0.8658 |
73
- | 2.677 | 26.3916 | 1900 | 2.2887 | 0.7663 |
74
- | 2.4528 | 27.7832 | 2000 | 2.0577 | 0.7150 |
75
- | 2.2515 | 29.1678 | 2100 | 1.8437 | 0.7130 |
76
- | 2.0648 | 30.5594 | 2200 | 1.7249 | 0.6420 |
77
- | 1.9139 | 31.9510 | 2300 | 1.7222 | 0.6655 |
78
- | 1.7839 | 33.3357 | 2400 | 1.5456 | 0.5844 |
79
- | 1.6434 | 34.7273 | 2500 | 1.5248 | 0.5709 |
80
- | 1.518 | 36.1119 | 2600 | 1.3946 | 0.5243 |
81
- | 1.4163 | 37.5035 | 2700 | 1.3937 | 0.5369 |
82
- | 1.3139 | 38.8951 | 2800 | 1.4023 | 0.5115 |
83
- | 1.2228 | 40.2797 | 2900 | 1.3532 | 0.5061 |
84
- | 1.126 | 41.6713 | 3000 | 1.4181 | 0.4991 |
85
- | 1.0416 | 43.0559 | 3100 | 1.4076 | 0.4900 |
86
- | 0.9605 | 44.4476 | 3200 | 1.4472 | 0.4923 |
87
- | 0.9086 | 45.8392 | 3300 | 1.4480 | 0.4846 |
88
- | 0.8519 | 47.2238 | 3400 | 1.4582 | 0.4811 |
89
- | 0.8244 | 48.6154 | 3500 | 1.4674 | 0.4832 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
 
92
  ### Framework versions
 
16
 
17
  This model is a fine-tuned version of [facebook/wav2vec2-xls-r-300m](https://huggingface.co/facebook/wav2vec2-xls-r-300m) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 1.6104
20
+ - Cer: 0.3521
21
 
22
  ## Model description
23
 
 
45
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
46
  - lr_scheduler_type: linear
47
  - lr_scheduler_warmup_steps: 100
48
+ - num_epochs: 100
49
  - mixed_precision_training: Native AMP
50
 
51
  ### Training results
52
 
53
  | Training Loss | Epoch | Step | Validation Loss | Cer |
54
  |:-------------:|:-------:|:----:|:---------------:|:------:|
55
+ | 11.1682 | 1.3916 | 100 | 3.8553 | 0.9903 |
56
+ | 3.9311 | 2.7832 | 200 | 3.8241 | 0.9903 |
57
+ | 3.8623 | 4.1678 | 300 | 3.7760 | 0.9903 |
58
+ | 3.7693 | 5.5594 | 400 | 3.6686 | 0.9903 |
59
+ | 3.671 | 6.9510 | 500 | 3.5900 | 0.9893 |
60
+ | 3.5618 | 8.3357 | 600 | 3.5169 | 0.9713 |
61
+ | 3.4994 | 9.7273 | 700 | 3.3552 | 0.9699 |
62
+ | 3.3323 | 11.1119 | 800 | 3.1385 | 0.9540 |
63
+ | 3.163 | 12.5035 | 900 | 2.9224 | 0.9186 |
64
+ | 2.7901 | 13.8951 | 1000 | 2.1802 | 0.7828 |
65
+ | 2.3425 | 15.2797 | 1100 | 1.8406 | 0.6529 |
66
+ | 2.0608 | 16.6713 | 1200 | 1.6505 | 0.6329 |
67
+ | 1.8813 | 18.0559 | 1300 | 1.4769 | 0.5715 |
68
+ | 1.6705 | 19.4476 | 1400 | 1.4793 | 0.5581 |
69
+ | 1.558 | 20.8392 | 1500 | 1.3079 | 0.4970 |
70
+ | 1.4213 | 22.2238 | 1600 | 1.3552 | 0.4947 |
71
+ | 1.3122 | 23.6154 | 1700 | 1.2368 | 0.4355 |
72
+ | 1.2303 | 25.0 | 1800 | 1.2108 | 0.4347 |
73
+ | 1.1152 | 26.3916 | 1900 | 1.2177 | 0.4307 |
74
+ | 1.0441 | 27.7832 | 2000 | 1.3236 | 0.4291 |
75
+ | 0.9626 | 29.1678 | 2100 | 1.2738 | 0.4157 |
76
+ | 0.8987 | 30.5594 | 2200 | 1.2683 | 0.4190 |
77
+ | 0.8367 | 31.9510 | 2300 | 1.2570 | 0.4144 |
78
+ | 0.7617 | 33.3357 | 2400 | 1.2331 | 0.3876 |
79
+ | 0.7069 | 34.7273 | 2500 | 1.3284 | 0.4037 |
80
+ | 0.6874 | 36.1119 | 2600 | 1.2948 | 0.3818 |
81
+ | 0.6615 | 37.5035 | 2700 | 1.2998 | 0.3977 |
82
+ | 0.6086 | 38.8951 | 2800 | 1.3369 | 0.3758 |
83
+ | 0.5804 | 40.2797 | 2900 | 1.2815 | 0.3838 |
84
+ | 0.548 | 41.6713 | 3000 | 1.3390 | 0.3766 |
85
+ | 0.5239 | 43.0559 | 3100 | 1.2572 | 0.3673 |
86
+ | 0.4983 | 44.4476 | 3200 | 1.2955 | 0.3671 |
87
+ | 0.4793 | 45.8392 | 3300 | 1.3563 | 0.3729 |
88
+ | 0.438 | 47.2238 | 3400 | 1.4153 | 0.3915 |
89
+ | 0.4274 | 48.6154 | 3500 | 1.3198 | 0.3663 |
90
+ | 0.4064 | 50.0 | 3600 | 1.4351 | 0.3814 |
91
+ | 0.3812 | 51.3916 | 3700 | 1.3514 | 0.3620 |
92
+ | 0.3753 | 52.7832 | 3800 | 1.3715 | 0.3492 |
93
+ | 0.3549 | 54.1678 | 3900 | 1.4133 | 0.3649 |
94
+ | 0.3262 | 55.5594 | 4000 | 1.4260 | 0.3574 |
95
+ | 0.3296 | 56.9510 | 4100 | 1.5134 | 0.3552 |
96
+ | 0.3136 | 58.3357 | 4200 | 1.4696 | 0.3587 |
97
+ | 0.3009 | 59.7273 | 4300 | 1.4326 | 0.3554 |
98
+ | 0.2764 | 61.1119 | 4400 | 1.4486 | 0.3572 |
99
+ | 0.2738 | 62.5035 | 4500 | 1.4463 | 0.3593 |
100
+ | 0.2574 | 63.8951 | 4600 | 1.4303 | 0.3583 |
101
+ | 0.2397 | 65.2797 | 4700 | 1.4538 | 0.3446 |
102
+ | 0.2474 | 66.6713 | 4800 | 1.4416 | 0.3496 |
103
+ | 0.2212 | 68.0559 | 4900 | 1.4766 | 0.3448 |
104
+ | 0.2173 | 69.4476 | 5000 | 1.4785 | 0.3496 |
105
+ | 0.2138 | 70.8392 | 5100 | 1.4859 | 0.3582 |
106
+ | 0.2037 | 72.2238 | 5200 | 1.5022 | 0.3500 |
107
+ | 0.194 | 73.6154 | 5300 | 1.4964 | 0.3490 |
108
+ | 0.1758 | 75.0 | 5400 | 1.5645 | 0.3552 |
109
+ | 0.1693 | 76.3916 | 5500 | 1.5215 | 0.3492 |
110
+ | 0.1682 | 77.7832 | 5600 | 1.5572 | 0.3436 |
111
+ | 0.1616 | 79.1678 | 5700 | 1.4971 | 0.3461 |
112
+ | 0.1625 | 80.5594 | 5800 | 1.5327 | 0.3516 |
113
+ | 0.1432 | 81.9510 | 5900 | 1.5595 | 0.3506 |
114
+ | 0.1348 | 83.3357 | 6000 | 1.5562 | 0.3483 |
115
+ | 0.137 | 84.7273 | 6100 | 1.5902 | 0.3485 |
116
+ | 0.1263 | 86.1119 | 6200 | 1.5853 | 0.3521 |
117
+ | 0.1271 | 87.5035 | 6300 | 1.5977 | 0.3488 |
118
+ | 0.123 | 88.8951 | 6400 | 1.6024 | 0.3498 |
119
+ | 0.117 | 90.2797 | 6500 | 1.6093 | 0.3535 |
120
+ | 0.1077 | 91.6713 | 6600 | 1.5807 | 0.3519 |
121
+ | 0.1072 | 93.0559 | 6700 | 1.5801 | 0.3477 |
122
+ | 0.1063 | 94.4476 | 6800 | 1.5894 | 0.3502 |
123
+ | 0.103 | 95.8392 | 6900 | 1.6027 | 0.3498 |
124
+ | 0.1032 | 97.2238 | 7000 | 1.6034 | 0.3485 |
125
+ | 0.0971 | 98.6154 | 7100 | 1.6104 | 0.3481 |
126
 
127
 
128
  ### Framework versions
trainer_state.json CHANGED
@@ -2,596 +2,1172 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 49.30769230769231,
6
  "eval_steps": 100,
7
- "global_step": 3550,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 1.3916083916083917,
14
- "grad_norm": 0.7624934315681458,
15
  "learning_rate": 0.000285,
16
- "loss": 11.3907,
17
  "step": 100
18
  },
19
  {
20
  "epoch": 1.3916083916083917,
21
- "eval_cer": 0.9906922629435718,
22
- "eval_loss": 3.860499858856201,
23
- "eval_runtime": 6.194,
24
- "eval_samples_per_second": 46.013,
25
- "eval_steps_per_second": 5.812,
26
  "step": 100
27
  },
28
  {
29
  "epoch": 2.7832167832167833,
30
- "grad_norm": 1.6113464832305908,
31
- "learning_rate": 0.0002917391304347826,
32
- "loss": 3.9297,
33
  "step": 200
34
  },
35
  {
36
  "epoch": 2.7832167832167833,
37
- "eval_cer": 0.9904983517548963,
38
- "eval_loss": 3.823500871658325,
39
- "eval_runtime": 6.1712,
40
- "eval_samples_per_second": 46.183,
41
- "eval_steps_per_second": 5.834,
42
  "step": 200
43
  },
44
  {
45
  "epoch": 4.1678321678321675,
46
- "grad_norm": 1.4047558307647705,
47
- "learning_rate": 0.0002832173913043478,
48
- "loss": 4.2491,
49
  "step": 300
50
  },
51
  {
52
  "epoch": 4.1678321678321675,
53
- "eval_cer": 0.9906922629435718,
54
- "eval_loss": 3.8156750202178955,
55
- "eval_runtime": 6.1466,
56
- "eval_samples_per_second": 46.367,
57
- "eval_steps_per_second": 5.857,
58
  "step": 300
59
  },
60
  {
61
  "epoch": 5.559440559440559,
62
- "grad_norm": 3.071784734725952,
63
- "learning_rate": 0.00027452173913043476,
64
- "loss": 3.8734,
65
  "step": 400
66
  },
67
  {
68
  "epoch": 5.559440559440559,
69
  "eval_cer": 0.9903044405662207,
70
- "eval_loss": 3.7851691246032715,
71
- "eval_runtime": 6.1459,
72
- "eval_samples_per_second": 46.372,
73
- "eval_steps_per_second": 5.858,
74
  "step": 400
75
  },
76
  {
77
  "epoch": 6.951048951048951,
78
- "grad_norm": 1.3413110971450806,
79
- "learning_rate": 0.0002658260869565217,
80
- "loss": 3.828,
81
  "step": 500
82
  },
83
  {
84
  "epoch": 6.951048951048951,
85
- "eval_cer": 0.9901105293775451,
86
- "eval_loss": 3.8628756999969482,
87
- "eval_runtime": 6.1561,
88
- "eval_samples_per_second": 46.295,
89
- "eval_steps_per_second": 5.848,
90
  "step": 500
91
  },
92
  {
93
  "epoch": 8.335664335664335,
94
- "grad_norm": 0.5281479358673096,
95
- "learning_rate": 0.0002571304347826087,
96
- "loss": 3.7617,
97
  "step": 600
98
  },
99
  {
100
  "epoch": 8.335664335664335,
101
- "eval_cer": 0.9903044405662207,
102
- "eval_loss": 3.7293624877929688,
103
- "eval_runtime": 6.1456,
104
- "eval_samples_per_second": 46.375,
105
- "eval_steps_per_second": 5.858,
106
  "step": 600
107
  },
108
  {
109
  "epoch": 9.727272727272727,
110
- "grad_norm": 0.4040379226207733,
111
- "learning_rate": 0.00024843478260869564,
112
- "loss": 3.7543,
113
  "step": 700
114
  },
115
  {
116
  "epoch": 9.727272727272727,
117
- "eval_cer": 0.9906922629435718,
118
- "eval_loss": 3.679058790206909,
119
- "eval_runtime": 6.1345,
120
- "eval_samples_per_second": 46.458,
121
- "eval_steps_per_second": 5.868,
122
  "step": 700
123
  },
124
  {
125
  "epoch": 11.111888111888112,
126
- "grad_norm": 0.8422073721885681,
127
- "learning_rate": 0.0002397391304347826,
128
- "loss": 3.6727,
129
  "step": 800
130
  },
131
  {
132
  "epoch": 11.111888111888112,
133
- "eval_cer": 0.9901105293775451,
134
- "eval_loss": 3.6882212162017822,
135
- "eval_runtime": 6.1414,
136
- "eval_samples_per_second": 46.406,
137
- "eval_steps_per_second": 5.862,
138
  "step": 800
139
  },
140
  {
141
  "epoch": 12.503496503496503,
142
- "grad_norm": 2.574584484100342,
143
- "learning_rate": 0.00023104347826086955,
144
- "loss": 3.6469,
145
  "step": 900
146
  },
147
  {
148
  "epoch": 12.503496503496503,
149
- "eval_cer": 0.9906922629435718,
150
- "eval_loss": 3.6921467781066895,
151
- "eval_runtime": 6.0896,
152
- "eval_samples_per_second": 46.801,
153
- "eval_steps_per_second": 5.912,
154
  "step": 900
155
  },
156
  {
157
  "epoch": 13.895104895104895,
158
- "grad_norm": 0.901987612247467,
159
- "learning_rate": 0.00022234782608695648,
160
- "loss": 3.6337,
161
  "step": 1000
162
  },
163
  {
164
  "epoch": 13.895104895104895,
165
- "eval_cer": 0.9897227070001939,
166
- "eval_loss": 3.638920307159424,
167
- "eval_runtime": 6.0087,
168
- "eval_samples_per_second": 47.432,
169
- "eval_steps_per_second": 5.991,
170
  "step": 1000
171
  },
172
  {
173
  "epoch": 15.27972027972028,
174
- "grad_norm": 0.6472112536430359,
175
- "learning_rate": 0.00021365217391304344,
176
- "loss": 3.5753,
177
  "step": 1100
178
  },
179
  {
180
  "epoch": 15.27972027972028,
181
- "eval_cer": 0.9870079503587357,
182
- "eval_loss": 3.5900485515594482,
183
- "eval_runtime": 6.0832,
184
- "eval_samples_per_second": 46.85,
185
- "eval_steps_per_second": 5.918,
186
  "step": 1100
187
  },
188
  {
189
  "epoch": 16.67132867132867,
190
- "grad_norm": 0.6265191435813904,
191
- "learning_rate": 0.00020495652173913042,
192
- "loss": 3.5827,
193
  "step": 1200
194
  },
195
  {
196
  "epoch": 16.67132867132867,
197
- "eval_cer": 0.9697498545666084,
198
- "eval_loss": 3.5273666381835938,
199
- "eval_runtime": 6.0422,
200
- "eval_samples_per_second": 47.168,
201
- "eval_steps_per_second": 5.958,
202
  "step": 1200
203
  },
204
  {
205
  "epoch": 18.055944055944057,
206
- "grad_norm": 0.3403220772743225,
207
- "learning_rate": 0.00019626086956521738,
208
- "loss": 3.4855,
209
  "step": 1300
210
  },
211
  {
212
  "epoch": 18.055944055944057,
213
- "eval_cer": 0.9660655419817723,
214
- "eval_loss": 3.4091908931732178,
215
- "eval_runtime": 6.0742,
216
- "eval_samples_per_second": 46.92,
217
- "eval_steps_per_second": 5.927,
218
  "step": 1300
219
  },
220
  {
221
  "epoch": 19.447552447552447,
222
- "grad_norm": 1.162210464477539,
223
- "learning_rate": 0.00018756521739130434,
224
- "loss": 3.4311,
225
  "step": 1400
226
  },
227
  {
228
  "epoch": 19.447552447552447,
229
- "eval_cer": 0.9744037230948226,
230
- "eval_loss": 3.344524383544922,
231
- "eval_runtime": 6.0671,
232
- "eval_samples_per_second": 46.974,
233
- "eval_steps_per_second": 5.934,
234
  "step": 1400
235
  },
236
  {
237
  "epoch": 20.83916083916084,
238
- "grad_norm": 0.8258137702941895,
239
- "learning_rate": 0.0001788695652173913,
240
- "loss": 3.3671,
241
  "step": 1500
242
  },
243
  {
244
  "epoch": 20.83916083916084,
245
- "eval_cer": 0.9647081636610433,
246
- "eval_loss": 3.2508041858673096,
247
- "eval_runtime": 6.0386,
248
- "eval_samples_per_second": 47.196,
249
- "eval_steps_per_second": 5.962,
250
  "step": 1500
251
  },
252
  {
253
  "epoch": 22.223776223776223,
254
- "grad_norm": 1.6096532344818115,
255
- "learning_rate": 0.00017017391304347825,
256
- "loss": 3.2572,
257
  "step": 1600
258
  },
259
  {
260
  "epoch": 22.223776223776223,
261
- "eval_cer": 0.9697498545666084,
262
- "eval_loss": 3.1160073280334473,
263
- "eval_runtime": 6.0443,
264
- "eval_samples_per_second": 47.152,
265
- "eval_steps_per_second": 5.956,
266
  "step": 1600
267
  },
268
  {
269
  "epoch": 23.615384615384617,
270
- "grad_norm": 1.1651092767715454,
271
- "learning_rate": 0.00016147826086956518,
272
- "loss": 3.1242,
273
  "step": 1700
274
  },
275
  {
276
  "epoch": 23.615384615384617,
277
- "eval_cer": 0.9538491370952104,
278
- "eval_loss": 2.8400416374206543,
279
- "eval_runtime": 6.0687,
280
- "eval_samples_per_second": 46.962,
281
- "eval_steps_per_second": 5.932,
282
  "step": 1700
283
  },
284
  {
285
  "epoch": 25.0,
286
- "grad_norm": 1.9780901670455933,
287
- "learning_rate": 0.00015278260869565217,
288
- "loss": 2.9165,
289
  "step": 1800
290
  },
291
  {
292
  "epoch": 25.0,
293
- "eval_cer": 0.8658134574364941,
294
- "eval_loss": 2.5779905319213867,
295
- "eval_runtime": 6.0337,
296
- "eval_samples_per_second": 47.235,
297
- "eval_steps_per_second": 5.967,
298
  "step": 1800
299
  },
300
  {
301
  "epoch": 26.39160839160839,
302
- "grad_norm": 1.7610468864440918,
303
- "learning_rate": 0.00014408695652173912,
304
- "loss": 2.677,
305
  "step": 1900
306
  },
307
  {
308
  "epoch": 26.39160839160839,
309
- "eval_cer": 0.7663370176459182,
310
- "eval_loss": 2.2887232303619385,
311
- "eval_runtime": 6.0359,
312
- "eval_samples_per_second": 47.218,
313
- "eval_steps_per_second": 5.964,
314
  "step": 1900
315
  },
316
  {
317
  "epoch": 27.783216783216783,
318
- "grad_norm": 1.3860621452331543,
319
- "learning_rate": 0.00013539130434782608,
320
- "loss": 2.4528,
321
  "step": 2000
322
  },
323
  {
324
  "epoch": 27.783216783216783,
325
- "eval_cer": 0.7149505526468877,
326
- "eval_loss": 2.057708501815796,
327
- "eval_runtime": 6.0733,
328
- "eval_samples_per_second": 46.927,
329
- "eval_steps_per_second": 5.928,
330
  "step": 2000
331
  },
332
  {
333
  "epoch": 29.167832167832167,
334
- "grad_norm": 1.6301653385162354,
335
- "learning_rate": 0.00012669565217391304,
336
- "loss": 2.2515,
337
  "step": 2100
338
  },
339
  {
340
  "epoch": 29.167832167832167,
341
- "eval_cer": 0.7130114407601319,
342
- "eval_loss": 1.8436604738235474,
343
- "eval_runtime": 6.0686,
344
- "eval_samples_per_second": 46.963,
345
- "eval_steps_per_second": 5.932,
346
  "step": 2100
347
  },
348
  {
349
  "epoch": 30.55944055944056,
350
- "grad_norm": 2.4235737323760986,
351
- "learning_rate": 0.00011799999999999998,
352
- "loss": 2.0648,
353
  "step": 2200
354
  },
355
  {
356
  "epoch": 30.55944055944056,
357
- "eval_cer": 0.6420399457048672,
358
- "eval_loss": 1.7248642444610596,
359
- "eval_runtime": 6.056,
360
- "eval_samples_per_second": 47.061,
361
- "eval_steps_per_second": 5.944,
362
  "step": 2200
363
  },
364
  {
365
  "epoch": 31.95104895104895,
366
- "grad_norm": 5.94592809677124,
367
- "learning_rate": 0.00010930434782608695,
368
- "loss": 1.9139,
369
  "step": 2300
370
  },
371
  {
372
  "epoch": 31.95104895104895,
373
- "eval_cer": 0.6655031995346131,
374
- "eval_loss": 1.7222360372543335,
375
- "eval_runtime": 6.0486,
376
- "eval_samples_per_second": 47.119,
377
- "eval_steps_per_second": 5.952,
378
  "step": 2300
379
  },
380
  {
381
  "epoch": 33.33566433566433,
382
- "grad_norm": 2.144540309906006,
383
- "learning_rate": 0.00010060869565217391,
384
- "loss": 1.7839,
385
  "step": 2400
386
  },
387
  {
388
  "epoch": 33.33566433566433,
389
- "eval_cer": 0.584448322668218,
390
- "eval_loss": 1.5456085205078125,
391
- "eval_runtime": 6.0417,
392
- "eval_samples_per_second": 47.172,
393
- "eval_steps_per_second": 5.959,
394
  "step": 2400
395
  },
396
  {
397
  "epoch": 34.72727272727273,
398
- "grad_norm": 2.905297040939331,
399
- "learning_rate": 9.191304347826085e-05,
400
- "loss": 1.6434,
401
  "step": 2500
402
  },
403
  {
404
  "epoch": 34.72727272727273,
405
- "eval_cer": 0.5708745394609269,
406
- "eval_loss": 1.5248050689697266,
407
- "eval_runtime": 6.0519,
408
- "eval_samples_per_second": 47.092,
409
- "eval_steps_per_second": 5.949,
410
  "step": 2500
411
  },
412
  {
413
  "epoch": 36.11188811188811,
414
- "grad_norm": 2.356537103652954,
415
- "learning_rate": 8.321739130434782e-05,
416
- "loss": 1.518,
417
  "step": 2600
418
  },
419
  {
420
  "epoch": 36.11188811188811,
421
- "eval_cer": 0.5243358541787861,
422
- "eval_loss": 1.394640564918518,
423
- "eval_runtime": 6.0483,
424
- "eval_samples_per_second": 47.121,
425
- "eval_steps_per_second": 5.952,
426
  "step": 2600
427
  },
428
  {
429
  "epoch": 37.50349650349651,
430
- "grad_norm": 1.8658636808395386,
431
- "learning_rate": 7.452173913043478e-05,
432
- "loss": 1.4163,
433
  "step": 2700
434
  },
435
  {
436
  "epoch": 37.50349650349651,
437
- "eval_cer": 0.5369400814426992,
438
- "eval_loss": 1.3937088251113892,
439
- "eval_runtime": 6.0582,
440
- "eval_samples_per_second": 47.044,
441
- "eval_steps_per_second": 5.942,
442
  "step": 2700
443
  },
444
  {
445
  "epoch": 38.89510489510489,
446
- "grad_norm": 2.7042293548583984,
447
- "learning_rate": 6.582608695652174e-05,
448
- "loss": 1.3139,
449
  "step": 2800
450
  },
451
  {
452
  "epoch": 38.89510489510489,
453
- "eval_cer": 0.5115377157261974,
454
- "eval_loss": 1.4022786617279053,
455
- "eval_runtime": 6.0783,
456
- "eval_samples_per_second": 46.888,
457
- "eval_steps_per_second": 5.923,
458
  "step": 2800
459
  },
460
  {
461
  "epoch": 40.27972027972028,
462
- "grad_norm": 2.1998214721679688,
463
- "learning_rate": 5.713043478260869e-05,
464
- "loss": 1.2228,
465
  "step": 2900
466
  },
467
  {
468
  "epoch": 40.27972027972028,
469
- "eval_cer": 0.506108202443281,
470
- "eval_loss": 1.3531999588012695,
471
- "eval_runtime": 6.0558,
472
- "eval_samples_per_second": 47.063,
473
- "eval_steps_per_second": 5.945,
474
  "step": 2900
475
  },
476
  {
477
  "epoch": 41.67132867132867,
478
- "grad_norm": 3.541335105895996,
479
- "learning_rate": 4.8434782608695647e-05,
480
- "loss": 1.126,
481
  "step": 3000
482
  },
483
  {
484
  "epoch": 41.67132867132867,
485
- "eval_cer": 0.49912739965095987,
486
- "eval_loss": 1.418116569519043,
487
- "eval_runtime": 6.0781,
488
- "eval_samples_per_second": 46.889,
489
- "eval_steps_per_second": 5.923,
490
  "step": 3000
491
  },
492
  {
493
  "epoch": 43.05594405594405,
494
- "grad_norm": 1.7935446500778198,
495
- "learning_rate": 3.9739130434782604e-05,
496
- "loss": 1.0416,
497
  "step": 3100
498
  },
499
  {
500
  "epoch": 43.05594405594405,
501
- "eval_cer": 0.4900135737832073,
502
- "eval_loss": 1.4075652360916138,
503
- "eval_runtime": 6.0943,
504
- "eval_samples_per_second": 46.765,
505
- "eval_steps_per_second": 5.907,
506
  "step": 3100
507
  },
508
  {
509
  "epoch": 44.44755244755245,
510
- "grad_norm": 4.2039570808410645,
511
- "learning_rate": 3.104347826086956e-05,
512
- "loss": 0.9605,
513
  "step": 3200
514
  },
515
  {
516
  "epoch": 44.44755244755245,
517
- "eval_cer": 0.49234050804731433,
518
- "eval_loss": 1.4471503496170044,
519
- "eval_runtime": 6.0413,
520
- "eval_samples_per_second": 47.175,
521
- "eval_steps_per_second": 5.959,
522
  "step": 3200
523
  },
524
  {
525
  "epoch": 45.83916083916084,
526
- "grad_norm": 2.918208360671997,
527
- "learning_rate": 2.2347826086956522e-05,
528
- "loss": 0.9086,
529
  "step": 3300
530
  },
531
  {
532
  "epoch": 45.83916083916084,
533
- "eval_cer": 0.48458406050029085,
534
- "eval_loss": 1.4480254650115967,
535
- "eval_runtime": 6.0766,
536
- "eval_samples_per_second": 46.901,
537
- "eval_steps_per_second": 5.924,
538
  "step": 3300
539
  },
540
  {
541
  "epoch": 47.22377622377623,
542
- "grad_norm": 6.653832912445068,
543
- "learning_rate": 1.3652173913043477e-05,
544
- "loss": 0.8519,
545
  "step": 3400
546
  },
547
  {
548
  "epoch": 47.22377622377623,
549
- "eval_cer": 0.4810936591041303,
550
- "eval_loss": 1.458249568939209,
551
- "eval_runtime": 6.0299,
552
- "eval_samples_per_second": 47.264,
553
- "eval_steps_per_second": 5.97,
554
  "step": 3400
555
  },
556
  {
557
  "epoch": 48.61538461538461,
558
- "grad_norm": 2.209395408630371,
559
- "learning_rate": 4.956521739130434e-06,
560
- "loss": 0.8244,
561
  "step": 3500
562
  },
563
  {
564
  "epoch": 48.61538461538461,
565
- "eval_cer": 0.4832266821795618,
566
- "eval_loss": 1.4674257040023804,
567
- "eval_runtime": 6.064,
568
- "eval_samples_per_second": 46.998,
569
- "eval_steps_per_second": 5.937,
570
  "step": 3500
571
  },
572
  {
573
- "epoch": 49.30769230769231,
574
- "step": 3550,
575
- "total_flos": 7.824800015175002e+18,
576
- "train_loss": 2.784584909358495,
577
- "train_runtime": 4913.0254,
578
- "train_samples_per_second": 23.214,
579
- "train_steps_per_second": 0.723
580
  },
581
  {
582
- "epoch": 49.30769230769231,
583
- "eval_cer": 0.47915454721737444,
584
- "eval_loss": 1.4787533283233643,
585
- "eval_runtime": 6.0551,
586
- "eval_samples_per_second": 47.068,
587
- "eval_steps_per_second": 5.945,
588
- "step": 3550
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
589
  }
590
  ],
591
  "logging_steps": 100,
592
- "max_steps": 3550,
593
  "num_input_tokens_seen": 0,
594
- "num_train_epochs": 50,
595
  "save_steps": 100,
596
  "stateful_callbacks": {
597
  "TrainerControl": {
@@ -605,7 +1181,7 @@
605
  "attributes": {}
606
  }
607
  },
608
- "total_flos": 7.824800015175002e+18,
609
  "train_batch_size": 16,
610
  "trial_name": null,
611
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 98.61538461538461,
6
  "eval_steps": 100,
7
+ "global_step": 7100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 1.3916083916083917,
14
+ "grad_norm": 0.7105618119239807,
15
  "learning_rate": 0.000285,
16
+ "loss": 11.1682,
17
  "step": 100
18
  },
19
  {
20
  "epoch": 1.3916083916083917,
21
+ "eval_cer": 0.9903044405662207,
22
+ "eval_loss": 3.855257987976074,
23
+ "eval_runtime": 6.2271,
24
+ "eval_samples_per_second": 45.768,
25
+ "eval_steps_per_second": 5.781,
26
  "step": 100
27
  },
28
  {
29
  "epoch": 2.7832167832167833,
30
+ "grad_norm": 1.5593620538711548,
31
+ "learning_rate": 0.0002959285714285714,
32
+ "loss": 3.9311,
33
  "step": 200
34
  },
35
  {
36
  "epoch": 2.7832167832167833,
37
+ "eval_cer": 0.9903044405662207,
38
+ "eval_loss": 3.8240556716918945,
39
+ "eval_runtime": 6.1912,
40
+ "eval_samples_per_second": 46.033,
41
+ "eval_steps_per_second": 5.815,
42
  "step": 200
43
  },
44
  {
45
  "epoch": 4.1678321678321675,
46
+ "grad_norm": 1.5898423194885254,
47
+ "learning_rate": 0.0002916428571428571,
48
+ "loss": 3.8623,
49
  "step": 300
50
  },
51
  {
52
  "epoch": 4.1678321678321675,
53
+ "eval_cer": 0.9903044405662207,
54
+ "eval_loss": 3.7760262489318848,
55
+ "eval_runtime": 6.1779,
56
+ "eval_samples_per_second": 46.132,
57
+ "eval_steps_per_second": 5.827,
58
  "step": 300
59
  },
60
  {
61
  "epoch": 5.559440559440559,
62
+ "grad_norm": 2.1614158153533936,
63
+ "learning_rate": 0.00028735714285714286,
64
+ "loss": 3.7693,
65
  "step": 400
66
  },
67
  {
68
  "epoch": 5.559440559440559,
69
  "eval_cer": 0.9903044405662207,
70
+ "eval_loss": 3.668625831604004,
71
+ "eval_runtime": 6.1599,
72
+ "eval_samples_per_second": 46.267,
73
+ "eval_steps_per_second": 5.844,
74
  "step": 400
75
  },
76
  {
77
  "epoch": 6.951048951048951,
78
+ "grad_norm": 1.3560220003128052,
79
+ "learning_rate": 0.00028307142857142854,
80
+ "loss": 3.671,
81
  "step": 500
82
  },
83
  {
84
  "epoch": 6.951048951048951,
85
+ "eval_cer": 0.9893348846228427,
86
+ "eval_loss": 3.590017080307007,
87
+ "eval_runtime": 6.1396,
88
+ "eval_samples_per_second": 46.42,
89
+ "eval_steps_per_second": 5.864,
90
  "step": 500
91
  },
92
  {
93
  "epoch": 8.335664335664335,
94
+ "grad_norm": 1.1032301187515259,
95
+ "learning_rate": 0.00027878571428571427,
96
+ "loss": 3.5618,
97
  "step": 600
98
  },
99
  {
100
  "epoch": 8.335664335664335,
101
+ "eval_cer": 0.9713011440760132,
102
+ "eval_loss": 3.5168509483337402,
103
+ "eval_runtime": 6.166,
104
+ "eval_samples_per_second": 46.221,
105
+ "eval_steps_per_second": 5.838,
106
  "step": 600
107
  },
108
  {
109
  "epoch": 9.727272727272727,
110
+ "grad_norm": 1.4139310121536255,
111
+ "learning_rate": 0.0002745,
112
+ "loss": 3.4994,
113
  "step": 700
114
  },
115
  {
116
  "epoch": 9.727272727272727,
117
+ "eval_cer": 0.9699437657552841,
118
+ "eval_loss": 3.3551578521728516,
119
+ "eval_runtime": 6.1552,
120
+ "eval_samples_per_second": 46.302,
121
+ "eval_steps_per_second": 5.849,
122
  "step": 700
123
  },
124
  {
125
  "epoch": 11.111888111888112,
126
+ "grad_norm": 0.8385369777679443,
127
+ "learning_rate": 0.0002702142857142857,
128
+ "loss": 3.3323,
129
  "step": 800
130
  },
131
  {
132
  "epoch": 11.111888111888112,
133
+ "eval_cer": 0.954043048283886,
134
+ "eval_loss": 3.138484001159668,
135
+ "eval_runtime": 6.0631,
136
+ "eval_samples_per_second": 47.005,
137
+ "eval_steps_per_second": 5.938,
138
  "step": 800
139
  },
140
  {
141
  "epoch": 12.503496503496503,
142
+ "grad_norm": 3.5932364463806152,
143
+ "learning_rate": 0.0002659285714285714,
144
+ "loss": 3.163,
145
  "step": 900
146
  },
147
  {
148
  "epoch": 12.503496503496503,
149
+ "eval_cer": 0.9185573007562536,
150
+ "eval_loss": 2.9224491119384766,
151
+ "eval_runtime": 6.0879,
152
+ "eval_samples_per_second": 46.814,
153
+ "eval_steps_per_second": 5.913,
154
  "step": 900
155
  },
156
  {
157
  "epoch": 13.895104895104895,
158
+ "grad_norm": 1.8399698734283447,
159
+ "learning_rate": 0.0002616428571428571,
160
+ "loss": 2.7901,
161
  "step": 1000
162
  },
163
  {
164
  "epoch": 13.895104895104895,
165
+ "eval_cer": 0.7828194686833431,
166
+ "eval_loss": 2.180238723754883,
167
+ "eval_runtime": 6.0259,
168
+ "eval_samples_per_second": 47.296,
169
+ "eval_steps_per_second": 5.974,
170
  "step": 1000
171
  },
172
  {
173
  "epoch": 15.27972027972028,
174
+ "grad_norm": 1.625369668006897,
175
+ "learning_rate": 0.00025735714285714283,
176
+ "loss": 2.3425,
177
  "step": 1100
178
  },
179
  {
180
  "epoch": 15.27972027972028,
181
+ "eval_cer": 0.6528989722707,
182
+ "eval_loss": 1.8405641317367554,
183
+ "eval_runtime": 6.0991,
184
+ "eval_samples_per_second": 46.728,
185
+ "eval_steps_per_second": 5.903,
186
  "step": 1100
187
  },
188
  {
189
  "epoch": 16.67132867132867,
190
+ "grad_norm": 2.625293016433716,
191
+ "learning_rate": 0.0002530714285714285,
192
+ "loss": 2.0608,
193
  "step": 1200
194
  },
195
  {
196
  "epoch": 16.67132867132867,
197
+ "eval_cer": 0.6329261198371146,
198
+ "eval_loss": 1.6505399942398071,
199
+ "eval_runtime": 6.1263,
200
+ "eval_samples_per_second": 46.52,
201
+ "eval_steps_per_second": 5.876,
202
  "step": 1200
203
  },
204
  {
205
  "epoch": 18.055944055944057,
206
+ "grad_norm": 1.92220139503479,
207
+ "learning_rate": 0.00024878571428571425,
208
+ "loss": 1.8813,
209
  "step": 1300
210
  },
211
  {
212
  "epoch": 18.055944055944057,
213
+ "eval_cer": 0.5714562730269537,
214
+ "eval_loss": 1.4768792390823364,
215
+ "eval_runtime": 6.1089,
216
+ "eval_samples_per_second": 46.653,
217
+ "eval_steps_per_second": 5.893,
218
  "step": 1300
219
  },
220
  {
221
  "epoch": 19.447552447552447,
222
+ "grad_norm": 3.1366982460021973,
223
+ "learning_rate": 0.0002445,
224
+ "loss": 1.6705,
225
  "step": 1400
226
  },
227
  {
228
  "epoch": 19.447552447552447,
229
+ "eval_cer": 0.5580764010083382,
230
+ "eval_loss": 1.479285478591919,
231
+ "eval_runtime": 6.1149,
232
+ "eval_samples_per_second": 46.608,
233
+ "eval_steps_per_second": 5.887,
234
  "step": 1400
235
  },
236
  {
237
  "epoch": 20.83916083916084,
238
+ "grad_norm": 2.116931200027466,
239
+ "learning_rate": 0.0002402142857142857,
240
+ "loss": 1.558,
241
  "step": 1500
242
  },
243
  {
244
  "epoch": 20.83916083916084,
245
+ "eval_cer": 0.4969943765755284,
246
+ "eval_loss": 1.3079291582107544,
247
+ "eval_runtime": 6.0774,
248
+ "eval_samples_per_second": 46.895,
249
+ "eval_steps_per_second": 5.924,
250
  "step": 1500
251
  },
252
  {
253
  "epoch": 22.223776223776223,
254
+ "grad_norm": 4.369190692901611,
255
+ "learning_rate": 0.00023592857142857142,
256
+ "loss": 1.4213,
257
  "step": 1600
258
  },
259
  {
260
  "epoch": 22.223776223776223,
261
+ "eval_cer": 0.49466744231142135,
262
+ "eval_loss": 1.3551599979400635,
263
+ "eval_runtime": 6.1085,
264
+ "eval_samples_per_second": 46.657,
265
+ "eval_steps_per_second": 5.893,
266
  "step": 1600
267
  },
268
  {
269
  "epoch": 23.615384615384617,
270
+ "grad_norm": 2.938127279281616,
271
+ "learning_rate": 0.00023164285714285713,
272
+ "loss": 1.3122,
273
  "step": 1700
274
  },
275
  {
276
  "epoch": 23.615384615384617,
277
+ "eval_cer": 0.4355245297653675,
278
+ "eval_loss": 1.236782431602478,
279
+ "eval_runtime": 6.1138,
280
+ "eval_samples_per_second": 46.616,
281
+ "eval_steps_per_second": 5.888,
282
  "step": 1700
283
  },
284
  {
285
  "epoch": 25.0,
286
+ "grad_norm": 4.156201362609863,
287
+ "learning_rate": 0.00022735714285714286,
288
+ "loss": 1.2303,
289
  "step": 1800
290
  },
291
  {
292
  "epoch": 25.0,
293
+ "eval_cer": 0.4347488850106651,
294
+ "eval_loss": 1.210758924484253,
295
+ "eval_runtime": 6.0871,
296
+ "eval_samples_per_second": 46.82,
297
+ "eval_steps_per_second": 5.914,
298
  "step": 1800
299
  },
300
  {
301
  "epoch": 26.39160839160839,
302
+ "grad_norm": 3.738050937652588,
303
+ "learning_rate": 0.00022307142857142854,
304
+ "loss": 1.1152,
305
  "step": 1900
306
  },
307
  {
308
  "epoch": 26.39160839160839,
309
+ "eval_cer": 0.4306767500484778,
310
+ "eval_loss": 1.2177391052246094,
311
+ "eval_runtime": 6.1061,
312
+ "eval_samples_per_second": 46.674,
313
+ "eval_steps_per_second": 5.896,
314
  "step": 1900
315
  },
316
  {
317
  "epoch": 27.783216783216783,
318
+ "grad_norm": 3.721745014190674,
319
+ "learning_rate": 0.00021878571428571428,
320
+ "loss": 1.0441,
321
  "step": 2000
322
  },
323
  {
324
  "epoch": 27.783216783216783,
325
+ "eval_cer": 0.4291254605390731,
326
+ "eval_loss": 1.3235960006713867,
327
+ "eval_runtime": 6.0708,
328
+ "eval_samples_per_second": 46.946,
329
+ "eval_steps_per_second": 5.93,
330
  "step": 2000
331
  },
332
  {
333
  "epoch": 29.167832167832167,
334
+ "grad_norm": 2.746555805206299,
335
+ "learning_rate": 0.00021449999999999998,
336
+ "loss": 0.9626,
337
  "step": 2100
338
  },
339
  {
340
  "epoch": 29.167832167832167,
341
+ "eval_cer": 0.4157455885204576,
342
+ "eval_loss": 1.2737609148025513,
343
+ "eval_runtime": 6.1093,
344
+ "eval_samples_per_second": 46.65,
345
+ "eval_steps_per_second": 5.893,
346
  "step": 2100
347
  },
348
  {
349
  "epoch": 30.55944055944056,
350
+ "grad_norm": 2.8237345218658447,
351
+ "learning_rate": 0.0002102142857142857,
352
+ "loss": 0.8987,
353
  "step": 2200
354
  },
355
  {
356
  "epoch": 30.55944055944056,
357
+ "eval_cer": 0.4190420787279426,
358
+ "eval_loss": 1.2683167457580566,
359
+ "eval_runtime": 6.1368,
360
+ "eval_samples_per_second": 46.441,
361
+ "eval_steps_per_second": 5.866,
362
  "step": 2200
363
  },
364
  {
365
  "epoch": 31.95104895104895,
366
+ "grad_norm": 5.561631679534912,
367
+ "learning_rate": 0.0002059285714285714,
368
+ "loss": 0.8367,
369
  "step": 2300
370
  },
371
  {
372
  "epoch": 31.95104895104895,
373
+ "eval_cer": 0.41438821019972855,
374
+ "eval_loss": 1.2570189237594604,
375
+ "eval_runtime": 6.095,
376
+ "eval_samples_per_second": 46.76,
377
+ "eval_steps_per_second": 5.906,
378
  "step": 2300
379
  },
380
  {
381
  "epoch": 33.33566433566433,
382
+ "grad_norm": 1.7682024240493774,
383
+ "learning_rate": 0.00020164285714285713,
384
+ "loss": 0.7617,
385
  "step": 2400
386
  },
387
  {
388
  "epoch": 33.33566433566433,
389
+ "eval_cer": 0.3876284661624976,
390
+ "eval_loss": 1.233074426651001,
391
+ "eval_runtime": 6.0871,
392
+ "eval_samples_per_second": 46.821,
393
+ "eval_steps_per_second": 5.914,
394
  "step": 2400
395
  },
396
  {
397
  "epoch": 34.72727272727273,
398
+ "grad_norm": 3.293351888656616,
399
+ "learning_rate": 0.00019735714285714284,
400
+ "loss": 0.7069,
401
  "step": 2500
402
  },
403
  {
404
  "epoch": 34.72727272727273,
405
+ "eval_cer": 0.40372309482257124,
406
+ "eval_loss": 1.328414797782898,
407
+ "eval_runtime": 6.0987,
408
+ "eval_samples_per_second": 46.731,
409
+ "eval_steps_per_second": 5.903,
410
  "step": 2500
411
  },
412
  {
413
  "epoch": 36.11188811188811,
414
+ "grad_norm": 1.8948358297348022,
415
+ "learning_rate": 0.00019307142857142854,
416
+ "loss": 0.6874,
417
  "step": 2600
418
  },
419
  {
420
  "epoch": 36.11188811188811,
421
+ "eval_cer": 0.38181113050223,
422
+ "eval_loss": 1.2947708368301392,
423
+ "eval_runtime": 6.0589,
424
+ "eval_samples_per_second": 47.038,
425
+ "eval_steps_per_second": 5.942,
426
  "step": 2600
427
  },
428
  {
429
  "epoch": 37.50349650349651,
430
+ "grad_norm": 2.3135173320770264,
431
+ "learning_rate": 0.00018878571428571428,
432
+ "loss": 0.6615,
433
  "step": 2700
434
  },
435
  {
436
  "epoch": 37.50349650349651,
437
+ "eval_cer": 0.3977118479736281,
438
+ "eval_loss": 1.299822211265564,
439
+ "eval_runtime": 6.0553,
440
+ "eval_samples_per_second": 47.066,
441
+ "eval_steps_per_second": 5.945,
442
  "step": 2700
443
  },
444
  {
445
  "epoch": 38.89510489510489,
446
+ "grad_norm": 8.07669448852539,
447
+ "learning_rate": 0.00018449999999999999,
448
+ "loss": 0.6086,
449
  "step": 2800
450
  },
451
  {
452
  "epoch": 38.89510489510489,
453
+ "eval_cer": 0.3757998836532868,
454
+ "eval_loss": 1.3369208574295044,
455
+ "eval_runtime": 6.0593,
456
+ "eval_samples_per_second": 47.035,
457
+ "eval_steps_per_second": 5.941,
458
  "step": 2800
459
  },
460
  {
461
  "epoch": 40.27972027972028,
462
+ "grad_norm": 2.3282470703125,
463
+ "learning_rate": 0.00018021428571428572,
464
+ "loss": 0.5804,
465
  "step": 2900
466
  },
467
  {
468
  "epoch": 40.27972027972028,
469
+ "eval_cer": 0.38375024238898586,
470
+ "eval_loss": 1.2814927101135254,
471
+ "eval_runtime": 6.0834,
472
+ "eval_samples_per_second": 46.849,
473
+ "eval_steps_per_second": 5.918,
474
  "step": 2900
475
  },
476
  {
477
  "epoch": 41.67132867132867,
478
+ "grad_norm": 5.158154010772705,
479
+ "learning_rate": 0.0001759285714285714,
480
+ "loss": 0.548,
481
  "step": 3000
482
  },
483
  {
484
  "epoch": 41.67132867132867,
485
+ "eval_cer": 0.37657552840798914,
486
+ "eval_loss": 1.3390411138534546,
487
+ "eval_runtime": 6.0871,
488
+ "eval_samples_per_second": 46.82,
489
+ "eval_steps_per_second": 5.914,
490
  "step": 3000
491
  },
492
  {
493
  "epoch": 43.05594405594405,
494
+ "grad_norm": 1.2800214290618896,
495
+ "learning_rate": 0.00017164285714285713,
496
+ "loss": 0.5239,
497
  "step": 3100
498
  },
499
  {
500
  "epoch": 43.05594405594405,
501
+ "eval_cer": 0.367267791351561,
502
+ "eval_loss": 1.257192850112915,
503
+ "eval_runtime": 6.0964,
504
+ "eval_samples_per_second": 46.749,
505
+ "eval_steps_per_second": 5.905,
506
  "step": 3100
507
  },
508
  {
509
  "epoch": 44.44755244755245,
510
+ "grad_norm": 4.9716010093688965,
511
+ "learning_rate": 0.00016735714285714284,
512
+ "loss": 0.4983,
513
  "step": 3200
514
  },
515
  {
516
  "epoch": 44.44755244755245,
517
+ "eval_cer": 0.3670738801628854,
518
+ "eval_loss": 1.295488715171814,
519
+ "eval_runtime": 6.034,
520
+ "eval_samples_per_second": 47.232,
521
+ "eval_steps_per_second": 5.966,
522
  "step": 3200
523
  },
524
  {
525
  "epoch": 45.83916083916084,
526
+ "grad_norm": 2.536774158477783,
527
+ "learning_rate": 0.0001631142857142857,
528
+ "loss": 0.4793,
529
  "step": 3300
530
  },
531
  {
532
  "epoch": 45.83916083916084,
533
+ "eval_cer": 0.372891215823153,
534
+ "eval_loss": 1.3562514781951904,
535
+ "eval_runtime": 6.0912,
536
+ "eval_samples_per_second": 46.789,
537
+ "eval_steps_per_second": 5.91,
538
  "step": 3300
539
  },
540
  {
541
  "epoch": 47.22377622377623,
542
+ "grad_norm": 3.6583845615386963,
543
+ "learning_rate": 0.00015882857142857142,
544
+ "loss": 0.438,
545
  "step": 3400
546
  },
547
  {
548
  "epoch": 47.22377622377623,
549
+ "eval_cer": 0.3915066899360093,
550
+ "eval_loss": 1.415280818939209,
551
+ "eval_runtime": 6.0618,
552
+ "eval_samples_per_second": 47.016,
553
+ "eval_steps_per_second": 5.939,
554
  "step": 3400
555
  },
556
  {
557
  "epoch": 48.61538461538461,
558
+ "grad_norm": 2.472052574157715,
559
+ "learning_rate": 0.00015454285714285712,
560
+ "loss": 0.4274,
561
  "step": 3500
562
  },
563
  {
564
  "epoch": 48.61538461538461,
565
+ "eval_cer": 0.36629823540818307,
566
+ "eval_loss": 1.319765567779541,
567
+ "eval_runtime": 6.1106,
568
+ "eval_samples_per_second": 46.64,
569
+ "eval_steps_per_second": 5.891,
570
  "step": 3500
571
  },
572
  {
573
+ "epoch": 50.0,
574
+ "grad_norm": 7.336581707000732,
575
+ "learning_rate": 0.00015025714285714286,
576
+ "loss": 0.4064,
577
+ "step": 3600
 
 
578
  },
579
  {
580
+ "epoch": 50.0,
581
+ "eval_cer": 0.3814233081248788,
582
+ "eval_loss": 1.4350632429122925,
583
+ "eval_runtime": 6.098,
584
+ "eval_samples_per_second": 46.736,
585
+ "eval_steps_per_second": 5.904,
586
+ "step": 3600
587
+ },
588
+ {
589
+ "epoch": 51.39160839160839,
590
+ "grad_norm": 2.7026124000549316,
591
+ "learning_rate": 0.00014597142857142856,
592
+ "loss": 0.3812,
593
+ "step": 3700
594
+ },
595
+ {
596
+ "epoch": 51.39160839160839,
597
+ "eval_cer": 0.36203218925732017,
598
+ "eval_loss": 1.351439356803894,
599
+ "eval_runtime": 6.0572,
600
+ "eval_samples_per_second": 47.051,
601
+ "eval_steps_per_second": 5.943,
602
+ "step": 3700
603
+ },
604
+ {
605
+ "epoch": 52.78321678321678,
606
+ "grad_norm": 1.425048589706421,
607
+ "learning_rate": 0.00014168571428571427,
608
+ "loss": 0.3753,
609
+ "step": 3800
610
+ },
611
+ {
612
+ "epoch": 52.78321678321678,
613
+ "eval_cer": 0.3492340508047314,
614
+ "eval_loss": 1.3715204000473022,
615
+ "eval_runtime": 6.0504,
616
+ "eval_samples_per_second": 47.104,
617
+ "eval_steps_per_second": 5.95,
618
+ "step": 3800
619
+ },
620
+ {
621
+ "epoch": 54.16783216783217,
622
+ "grad_norm": 2.945066452026367,
623
+ "learning_rate": 0.0001374,
624
+ "loss": 0.3549,
625
+ "step": 3900
626
+ },
627
+ {
628
+ "epoch": 54.16783216783217,
629
+ "eval_cer": 0.36494085708745394,
630
+ "eval_loss": 1.4132966995239258,
631
+ "eval_runtime": 6.0971,
632
+ "eval_samples_per_second": 46.743,
633
+ "eval_steps_per_second": 5.904,
634
+ "step": 3900
635
+ },
636
+ {
637
+ "epoch": 55.55944055944056,
638
+ "grad_norm": 1.2087554931640625,
639
+ "learning_rate": 0.0001331142857142857,
640
+ "loss": 0.3262,
641
+ "step": 4000
642
+ },
643
+ {
644
+ "epoch": 55.55944055944056,
645
+ "eval_cer": 0.3573783207291061,
646
+ "eval_loss": 1.4259963035583496,
647
+ "eval_runtime": 6.0615,
648
+ "eval_samples_per_second": 47.018,
649
+ "eval_steps_per_second": 5.939,
650
+ "step": 4000
651
+ },
652
+ {
653
+ "epoch": 56.95104895104895,
654
+ "grad_norm": 6.813267230987549,
655
+ "learning_rate": 0.00012882857142857142,
656
+ "loss": 0.3296,
657
+ "step": 4100
658
+ },
659
+ {
660
+ "epoch": 56.95104895104895,
661
+ "eval_cer": 0.35524529765367463,
662
+ "eval_loss": 1.5134129524230957,
663
+ "eval_runtime": 6.0512,
664
+ "eval_samples_per_second": 47.098,
665
+ "eval_steps_per_second": 5.949,
666
+ "step": 4100
667
+ },
668
+ {
669
+ "epoch": 58.33566433566433,
670
+ "grad_norm": 1.582381248474121,
671
+ "learning_rate": 0.00012454285714285713,
672
+ "loss": 0.3136,
673
+ "step": 4200
674
+ },
675
+ {
676
+ "epoch": 58.33566433566433,
677
+ "eval_cer": 0.35873569904983515,
678
+ "eval_loss": 1.4695625305175781,
679
+ "eval_runtime": 6.0851,
680
+ "eval_samples_per_second": 46.836,
681
+ "eval_steps_per_second": 5.916,
682
+ "step": 4200
683
+ },
684
+ {
685
+ "epoch": 59.72727272727273,
686
+ "grad_norm": 0.9694690108299255,
687
+ "learning_rate": 0.00012025714285714285,
688
+ "loss": 0.3009,
689
+ "step": 4300
690
+ },
691
+ {
692
+ "epoch": 59.72727272727273,
693
+ "eval_cer": 0.3554392088423502,
694
+ "eval_loss": 1.432596206665039,
695
+ "eval_runtime": 6.0669,
696
+ "eval_samples_per_second": 46.976,
697
+ "eval_steps_per_second": 5.934,
698
+ "step": 4300
699
+ },
700
+ {
701
+ "epoch": 61.11188811188811,
702
+ "grad_norm": 1.6826478242874146,
703
+ "learning_rate": 0.00011597142857142855,
704
+ "loss": 0.2764,
705
+ "step": 4400
706
+ },
707
+ {
708
+ "epoch": 61.11188811188811,
709
+ "eval_cer": 0.35718440954043046,
710
+ "eval_loss": 1.4485613107681274,
711
+ "eval_runtime": 6.0638,
712
+ "eval_samples_per_second": 47.0,
713
+ "eval_steps_per_second": 5.937,
714
+ "step": 4400
715
+ },
716
+ {
717
+ "epoch": 62.50349650349651,
718
+ "grad_norm": 1.2600972652435303,
719
+ "learning_rate": 0.00011168571428571427,
720
+ "loss": 0.2738,
721
+ "step": 4500
722
+ },
723
+ {
724
+ "epoch": 62.50349650349651,
725
+ "eval_cer": 0.3593174326158619,
726
+ "eval_loss": 1.446284294128418,
727
+ "eval_runtime": 6.0798,
728
+ "eval_samples_per_second": 46.876,
729
+ "eval_steps_per_second": 5.921,
730
+ "step": 4500
731
+ },
732
+ {
733
+ "epoch": 63.89510489510489,
734
+ "grad_norm": 2.8400301933288574,
735
+ "learning_rate": 0.00010739999999999998,
736
+ "loss": 0.2574,
737
+ "step": 4600
738
+ },
739
+ {
740
+ "epoch": 63.89510489510489,
741
+ "eval_cer": 0.358347876672484,
742
+ "eval_loss": 1.4303468465805054,
743
+ "eval_runtime": 6.1117,
744
+ "eval_samples_per_second": 46.632,
745
+ "eval_steps_per_second": 5.89,
746
+ "step": 4600
747
+ },
748
+ {
749
+ "epoch": 65.27972027972028,
750
+ "grad_norm": 1.22808039188385,
751
+ "learning_rate": 0.0001031142857142857,
752
+ "loss": 0.2397,
753
+ "step": 4700
754
+ },
755
+ {
756
+ "epoch": 65.27972027972028,
757
+ "eval_cer": 0.3445801822765174,
758
+ "eval_loss": 1.4538311958312988,
759
+ "eval_runtime": 6.0831,
760
+ "eval_samples_per_second": 46.851,
761
+ "eval_steps_per_second": 5.918,
762
+ "step": 4700
763
+ },
764
+ {
765
+ "epoch": 66.67132867132867,
766
+ "grad_norm": 1.9458190202713013,
767
+ "learning_rate": 9.882857142857141e-05,
768
+ "loss": 0.2474,
769
+ "step": 4800
770
+ },
771
+ {
772
+ "epoch": 66.67132867132867,
773
+ "eval_cer": 0.3496218731820826,
774
+ "eval_loss": 1.4416499137878418,
775
+ "eval_runtime": 6.1066,
776
+ "eval_samples_per_second": 46.671,
777
+ "eval_steps_per_second": 5.895,
778
+ "step": 4800
779
+ },
780
+ {
781
+ "epoch": 68.05594405594405,
782
+ "grad_norm": 1.5397316217422485,
783
+ "learning_rate": 9.454285714285714e-05,
784
+ "loss": 0.2212,
785
+ "step": 4900
786
+ },
787
+ {
788
+ "epoch": 68.05594405594405,
789
+ "eval_cer": 0.34477409346519294,
790
+ "eval_loss": 1.476562738418579,
791
+ "eval_runtime": 6.125,
792
+ "eval_samples_per_second": 46.53,
793
+ "eval_steps_per_second": 5.878,
794
+ "step": 4900
795
+ },
796
+ {
797
+ "epoch": 69.44755244755245,
798
+ "grad_norm": 4.152817249298096,
799
+ "learning_rate": 9.025714285714285e-05,
800
+ "loss": 0.2173,
801
+ "step": 5000
802
+ },
803
+ {
804
+ "epoch": 69.44755244755245,
805
+ "eval_cer": 0.3496218731820826,
806
+ "eval_loss": 1.4784653186798096,
807
+ "eval_runtime": 6.0989,
808
+ "eval_samples_per_second": 46.73,
809
+ "eval_steps_per_second": 5.903,
810
+ "step": 5000
811
+ },
812
+ {
813
+ "epoch": 70.83916083916084,
814
+ "grad_norm": 1.8647962808609009,
815
+ "learning_rate": 8.597142857142857e-05,
816
+ "loss": 0.2138,
817
+ "step": 5100
818
+ },
819
+ {
820
+ "epoch": 70.83916083916084,
821
+ "eval_cer": 0.3581539654838084,
822
+ "eval_loss": 1.4859139919281006,
823
+ "eval_runtime": 6.0911,
824
+ "eval_samples_per_second": 46.79,
825
+ "eval_steps_per_second": 5.91,
826
+ "step": 5100
827
+ },
828
+ {
829
+ "epoch": 72.22377622377623,
830
+ "grad_norm": 6.688198089599609,
831
+ "learning_rate": 8.168571428571428e-05,
832
+ "loss": 0.2037,
833
+ "step": 5200
834
+ },
835
+ {
836
+ "epoch": 72.22377622377623,
837
+ "eval_cer": 0.3500096955594338,
838
+ "eval_loss": 1.5022231340408325,
839
+ "eval_runtime": 6.1063,
840
+ "eval_samples_per_second": 46.673,
841
+ "eval_steps_per_second": 5.896,
842
+ "step": 5200
843
+ },
844
+ {
845
+ "epoch": 73.61538461538461,
846
+ "grad_norm": 2.6784660816192627,
847
+ "learning_rate": 7.74e-05,
848
+ "loss": 0.194,
849
+ "step": 5300
850
+ },
851
+ {
852
+ "epoch": 73.61538461538461,
853
+ "eval_cer": 0.34904013961605584,
854
+ "eval_loss": 1.4964337348937988,
855
+ "eval_runtime": 6.0868,
856
+ "eval_samples_per_second": 46.823,
857
+ "eval_steps_per_second": 5.914,
858
+ "step": 5300
859
+ },
860
+ {
861
+ "epoch": 75.0,
862
+ "grad_norm": 2.1717565059661865,
863
+ "learning_rate": 7.31142857142857e-05,
864
+ "loss": 0.1758,
865
+ "step": 5400
866
+ },
867
+ {
868
+ "epoch": 75.0,
869
+ "eval_cer": 0.35524529765367463,
870
+ "eval_loss": 1.5645275115966797,
871
+ "eval_runtime": 6.0591,
872
+ "eval_samples_per_second": 47.037,
873
+ "eval_steps_per_second": 5.941,
874
+ "step": 5400
875
+ },
876
+ {
877
+ "epoch": 76.39160839160839,
878
+ "grad_norm": 3.3418147563934326,
879
+ "learning_rate": 6.882857142857142e-05,
880
+ "loss": 0.1693,
881
+ "step": 5500
882
+ },
883
+ {
884
+ "epoch": 76.39160839160839,
885
+ "eval_cer": 0.3492340508047314,
886
+ "eval_loss": 1.5214943885803223,
887
+ "eval_runtime": 6.0965,
888
+ "eval_samples_per_second": 46.748,
889
+ "eval_steps_per_second": 5.905,
890
+ "step": 5500
891
+ },
892
+ {
893
+ "epoch": 77.78321678321679,
894
+ "grad_norm": 1.1721317768096924,
895
+ "learning_rate": 6.454285714285713e-05,
896
+ "loss": 0.1682,
897
+ "step": 5600
898
+ },
899
+ {
900
+ "epoch": 77.78321678321679,
901
+ "eval_cer": 0.34361062633313944,
902
+ "eval_loss": 1.557164192199707,
903
+ "eval_runtime": 6.0786,
904
+ "eval_samples_per_second": 46.886,
905
+ "eval_steps_per_second": 5.922,
906
+ "step": 5600
907
+ },
908
+ {
909
+ "epoch": 79.16783216783217,
910
+ "grad_norm": 1.2348577976226807,
911
+ "learning_rate": 6.0257142857142846e-05,
912
+ "loss": 0.1616,
913
+ "step": 5700
914
+ },
915
+ {
916
+ "epoch": 79.16783216783217,
917
+ "eval_cer": 0.3461314717859221,
918
+ "eval_loss": 1.497072696685791,
919
+ "eval_runtime": 6.0713,
920
+ "eval_samples_per_second": 46.942,
921
+ "eval_steps_per_second": 5.93,
922
+ "step": 5700
923
+ },
924
+ {
925
+ "epoch": 80.55944055944056,
926
+ "grad_norm": 1.101247787475586,
927
+ "learning_rate": 5.597142857142857e-05,
928
+ "loss": 0.1625,
929
+ "step": 5800
930
+ },
931
+ {
932
+ "epoch": 80.55944055944056,
933
+ "eval_cer": 0.3515609850688385,
934
+ "eval_loss": 1.5326788425445557,
935
+ "eval_runtime": 6.0886,
936
+ "eval_samples_per_second": 46.809,
937
+ "eval_steps_per_second": 5.913,
938
+ "step": 5800
939
+ },
940
+ {
941
+ "epoch": 81.95104895104895,
942
+ "grad_norm": 4.953105449676514,
943
+ "learning_rate": 5.168571428571428e-05,
944
+ "loss": 0.1432,
945
+ "step": 5900
946
+ },
947
+ {
948
+ "epoch": 81.95104895104895,
949
+ "eval_cer": 0.35059142912546054,
950
+ "eval_loss": 1.5595433712005615,
951
+ "eval_runtime": 6.078,
952
+ "eval_samples_per_second": 46.89,
953
+ "eval_steps_per_second": 5.923,
954
+ "step": 5900
955
+ },
956
+ {
957
+ "epoch": 83.33566433566433,
958
+ "grad_norm": 1.2514948844909668,
959
+ "learning_rate": 4.7399999999999993e-05,
960
+ "loss": 0.1348,
961
+ "step": 6000
962
+ },
963
+ {
964
+ "epoch": 83.33566433566433,
965
+ "eval_cer": 0.3482644948613535,
966
+ "eval_loss": 1.556195616722107,
967
+ "eval_runtime": 6.0756,
968
+ "eval_samples_per_second": 46.909,
969
+ "eval_steps_per_second": 5.925,
970
+ "step": 6000
971
+ },
972
+ {
973
+ "epoch": 84.72727272727273,
974
+ "grad_norm": 0.5957698225975037,
975
+ "learning_rate": 4.311428571428571e-05,
976
+ "loss": 0.137,
977
+ "step": 6100
978
+ },
979
+ {
980
+ "epoch": 84.72727272727273,
981
+ "eval_cer": 0.3484584060500291,
982
+ "eval_loss": 1.590181827545166,
983
+ "eval_runtime": 6.0761,
984
+ "eval_samples_per_second": 46.905,
985
+ "eval_steps_per_second": 5.925,
986
+ "step": 6100
987
+ },
988
+ {
989
+ "epoch": 86.1118881118881,
990
+ "grad_norm": 2.8553950786590576,
991
+ "learning_rate": 3.882857142857142e-05,
992
+ "loss": 0.1263,
993
+ "step": 6200
994
+ },
995
+ {
996
+ "epoch": 86.1118881118881,
997
+ "eval_cer": 0.35214271863486524,
998
+ "eval_loss": 1.5852645635604858,
999
+ "eval_runtime": 6.1244,
1000
+ "eval_samples_per_second": 46.535,
1001
+ "eval_steps_per_second": 5.878,
1002
+ "step": 6200
1003
+ },
1004
+ {
1005
+ "epoch": 87.5034965034965,
1006
+ "grad_norm": 0.7932090759277344,
1007
+ "learning_rate": 3.454285714285714e-05,
1008
+ "loss": 0.1271,
1009
+ "step": 6300
1010
+ },
1011
+ {
1012
+ "epoch": 87.5034965034965,
1013
+ "eval_cer": 0.3488462284273803,
1014
+ "eval_loss": 1.597654938697815,
1015
+ "eval_runtime": 6.1436,
1016
+ "eval_samples_per_second": 46.389,
1017
+ "eval_steps_per_second": 5.86,
1018
+ "step": 6300
1019
+ },
1020
+ {
1021
+ "epoch": 88.8951048951049,
1022
+ "grad_norm": 2.1244022846221924,
1023
+ "learning_rate": 3.0257142857142855e-05,
1024
+ "loss": 0.123,
1025
+ "step": 6400
1026
+ },
1027
+ {
1028
+ "epoch": 88.8951048951049,
1029
+ "eval_cer": 0.3498157843707582,
1030
+ "eval_loss": 1.6023805141448975,
1031
+ "eval_runtime": 6.0927,
1032
+ "eval_samples_per_second": 46.777,
1033
+ "eval_steps_per_second": 5.909,
1034
+ "step": 6400
1035
+ },
1036
+ {
1037
+ "epoch": 90.27972027972028,
1038
+ "grad_norm": 1.3532764911651611,
1039
+ "learning_rate": 2.5971428571428572e-05,
1040
+ "loss": 0.117,
1041
+ "step": 6500
1042
+ },
1043
+ {
1044
+ "epoch": 90.27972027972028,
1045
+ "eval_cer": 0.3535000969555943,
1046
+ "eval_loss": 1.6092747449874878,
1047
+ "eval_runtime": 6.1301,
1048
+ "eval_samples_per_second": 46.492,
1049
+ "eval_steps_per_second": 5.873,
1050
+ "step": 6500
1051
+ },
1052
+ {
1053
+ "epoch": 91.67132867132867,
1054
+ "grad_norm": 3.1814770698547363,
1055
+ "learning_rate": 2.1685714285714286e-05,
1056
+ "loss": 0.1077,
1057
+ "step": 6600
1058
+ },
1059
+ {
1060
+ "epoch": 91.67132867132867,
1061
+ "eval_cer": 0.35194880744618967,
1062
+ "eval_loss": 1.5807358026504517,
1063
+ "eval_runtime": 6.0891,
1064
+ "eval_samples_per_second": 46.805,
1065
+ "eval_steps_per_second": 5.912,
1066
+ "step": 6600
1067
+ },
1068
+ {
1069
+ "epoch": 93.05594405594405,
1070
+ "grad_norm": 1.0375070571899414,
1071
+ "learning_rate": 1.74e-05,
1072
+ "loss": 0.1072,
1073
+ "step": 6700
1074
+ },
1075
+ {
1076
+ "epoch": 93.05594405594405,
1077
+ "eval_cer": 0.3476827612953267,
1078
+ "eval_loss": 1.580068588256836,
1079
+ "eval_runtime": 6.1035,
1080
+ "eval_samples_per_second": 46.695,
1081
+ "eval_steps_per_second": 5.898,
1082
+ "step": 6700
1083
+ },
1084
+ {
1085
+ "epoch": 94.44755244755245,
1086
+ "grad_norm": 1.3363580703735352,
1087
+ "learning_rate": 1.3114285714285713e-05,
1088
+ "loss": 0.1063,
1089
+ "step": 6800
1090
+ },
1091
+ {
1092
+ "epoch": 94.44755244755245,
1093
+ "eval_cer": 0.35020360674810935,
1094
+ "eval_loss": 1.5893759727478027,
1095
+ "eval_runtime": 6.0883,
1096
+ "eval_samples_per_second": 46.811,
1097
+ "eval_steps_per_second": 5.913,
1098
+ "step": 6800
1099
+ },
1100
+ {
1101
+ "epoch": 95.83916083916084,
1102
+ "grad_norm": 2.259174108505249,
1103
+ "learning_rate": 8.828571428571429e-06,
1104
+ "loss": 0.103,
1105
+ "step": 6900
1106
+ },
1107
+ {
1108
+ "epoch": 95.83916083916084,
1109
+ "eval_cer": 0.3498157843707582,
1110
+ "eval_loss": 1.602723240852356,
1111
+ "eval_runtime": 6.0922,
1112
+ "eval_samples_per_second": 46.781,
1113
+ "eval_steps_per_second": 5.909,
1114
+ "step": 6900
1115
+ },
1116
+ {
1117
+ "epoch": 97.22377622377623,
1118
+ "grad_norm": 1.8573235273361206,
1119
+ "learning_rate": 4.585714285714285e-06,
1120
+ "loss": 0.1032,
1121
+ "step": 7000
1122
+ },
1123
+ {
1124
+ "epoch": 97.22377622377623,
1125
+ "eval_cer": 0.3484584060500291,
1126
+ "eval_loss": 1.6034408807754517,
1127
+ "eval_runtime": 6.1003,
1128
+ "eval_samples_per_second": 46.719,
1129
+ "eval_steps_per_second": 5.901,
1130
+ "step": 7000
1131
+ },
1132
+ {
1133
+ "epoch": 98.61538461538461,
1134
+ "grad_norm": 1.2302757501602173,
1135
+ "learning_rate": 3e-07,
1136
+ "loss": 0.0971,
1137
+ "step": 7100
1138
+ },
1139
+ {
1140
+ "epoch": 98.61538461538461,
1141
+ "eval_cer": 0.3480705836726779,
1142
+ "eval_loss": 1.6103968620300293,
1143
+ "eval_runtime": 6.1158,
1144
+ "eval_samples_per_second": 46.6,
1145
+ "eval_steps_per_second": 5.886,
1146
+ "step": 7100
1147
+ },
1148
+ {
1149
+ "epoch": 98.61538461538461,
1150
+ "step": 7100,
1151
+ "total_flos": 1.5637471182402683e+19,
1152
+ "train_loss": 1.063481583259475,
1153
+ "train_runtime": 9766.9628,
1154
+ "train_samples_per_second": 23.354,
1155
+ "train_steps_per_second": 0.727
1156
+ },
1157
+ {
1158
+ "epoch": 98.61538461538461,
1159
+ "eval_cer": 0.35214271863486524,
1160
+ "eval_loss": 1.6103789806365967,
1161
+ "eval_runtime": 6.0538,
1162
+ "eval_samples_per_second": 47.078,
1163
+ "eval_steps_per_second": 5.947,
1164
+ "step": 7100
1165
  }
1166
  ],
1167
  "logging_steps": 100,
1168
+ "max_steps": 7100,
1169
  "num_input_tokens_seen": 0,
1170
+ "num_train_epochs": 100,
1171
  "save_steps": 100,
1172
  "stateful_callbacks": {
1173
  "TrainerControl": {
 
1181
  "attributes": {}
1182
  }
1183
  },
1184
+ "total_flos": 1.5637471182402683e+19,
1185
  "train_batch_size": 16,
1186
  "trial_name": null,
1187
  "trial_params": null