rbcurzon commited on
Commit
d770947
·
verified ·
1 Parent(s): cce268f

End of training

Browse files
README.md CHANGED
@@ -4,11 +4,24 @@ license: apache-2.0
4
  base_model: openai/whisper-small
5
  tags:
6
  - generated_from_trainer
 
 
7
  metrics:
8
  - wer
9
  model-index:
10
  - name: whisper-medium-test
11
- results: []
 
 
 
 
 
 
 
 
 
 
 
12
  ---
13
 
14
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -16,7 +29,7 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # whisper-medium-test
18
 
19
- This model is a fine-tuned version of [openai/whisper-small](https://huggingface.co/openai/whisper-small) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 0.4044
22
  - Wer: 0.1675
 
4
  base_model: openai/whisper-small
5
  tags:
6
  - generated_from_trainer
7
+ datasets:
8
+ - rbcurzon/ph_dialect_asr
9
  metrics:
10
  - wer
11
  model-index:
12
  - name: whisper-medium-test
13
+ results:
14
+ - task:
15
+ name: Automatic Speech Recognition
16
+ type: automatic-speech-recognition
17
+ dataset:
18
+ name: rbcurzon/ph_dialect_asr all
19
+ type: rbcurzon/ph_dialect_asr
20
+ args: all
21
+ metrics:
22
+ - name: Wer
23
+ type: wer
24
+ value: 0.1675184165997231
25
  ---
26
 
27
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
29
 
30
  # whisper-medium-test
31
 
32
+ This model is a fine-tuned version of [openai/whisper-small](https://huggingface.co/openai/whisper-small) on the rbcurzon/ph_dialect_asr all dataset.
33
  It achieves the following results on the evaluation set:
34
  - Loss: 0.4044
35
  - Wer: 0.1675
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 4.934979423868313,
3
- "eval_loss": 0.2823709547519684,
4
- "eval_runtime": 1581.903,
5
- "eval_samples": 2619,
6
- "eval_samples_per_second": 1.656,
7
- "eval_steps_per_second": 0.414,
8
- "eval_wer": 0.12251221112488153,
9
- "total_flos": 4.894412894502912e+19,
10
- "train_loss": 0.1704294042487939,
11
- "train_runtime": 13123.9045,
12
- "train_samples": 9717,
13
- "train_samples_per_second": 3.657,
14
- "train_steps_per_second": 0.229
15
  }
 
1
  {
2
+ "epoch": 14.79466271312083,
3
+ "eval_loss": 0.4044080376625061,
4
+ "eval_runtime": 942.1523,
5
+ "eval_samples": 2885,
6
+ "eval_samples_per_second": 3.062,
7
+ "eval_steps_per_second": 0.383,
8
+ "eval_wer": 0.1675184165997231,
9
+ "total_flos": 4.607669973179538e+19,
10
+ "train_loss": 0.11535389684215187,
11
+ "train_runtime": 25285.0125,
12
+ "train_samples": 10787,
13
+ "train_samples_per_second": 6.328,
14
+ "train_steps_per_second": 0.198
15
  }
eval_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 4.934979423868313,
3
- "eval_loss": 0.2823709547519684,
4
- "eval_runtime": 1581.903,
5
- "eval_samples": 2619,
6
- "eval_samples_per_second": 1.656,
7
- "eval_steps_per_second": 0.414,
8
- "eval_wer": 0.12251221112488153
9
  }
 
1
  {
2
+ "epoch": 14.79466271312083,
3
+ "eval_loss": 0.4044080376625061,
4
+ "eval_runtime": 942.1523,
5
+ "eval_samples": 2885,
6
+ "eval_samples_per_second": 3.062,
7
+ "eval_steps_per_second": 0.383,
8
+ "eval_wer": 0.1675184165997231
9
  }
runs/Aug06_03-27-27_4853604300ab/events.out.tfevents.1754477125.4853604300ab.1267.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ed7322a5f0a14fed47a4d413f46ab0f6368b93ab016914b91f347c32ec3bc1f
3
+ size 406
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 4.934979423868313,
3
- "total_flos": 4.894412894502912e+19,
4
- "train_loss": 0.1704294042487939,
5
- "train_runtime": 13123.9045,
6
- "train_samples": 9717,
7
- "train_samples_per_second": 3.657,
8
- "train_steps_per_second": 0.229
9
  }
 
1
  {
2
+ "epoch": 14.79466271312083,
3
+ "total_flos": 4.607669973179538e+19,
4
+ "train_loss": 0.11535389684215187,
5
+ "train_runtime": 25285.0125,
6
+ "train_samples": 10787,
7
+ "train_samples_per_second": 6.328,
8
+ "train_steps_per_second": 0.198
9
  }
trainer_state.json CHANGED
@@ -2,894 +2,1472 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 4.934979423868313,
6
  "eval_steps": 1000,
7
- "global_step": 3000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.0411522633744856,
14
- "grad_norm": 14.940086364746094,
15
- "learning_rate": 4.6000000000000004e-07,
16
- "loss": 1.357,
17
  "step": 25
18
  },
19
  {
20
- "epoch": 0.0823045267489712,
21
- "grad_norm": 9.376535415649414,
22
- "learning_rate": 9.600000000000001e-07,
23
- "loss": 1.124,
24
  "step": 50
25
  },
26
  {
27
- "epoch": 0.12345679012345678,
28
- "grad_norm": 8.351006507873535,
29
  "learning_rate": 1.44e-06,
30
- "loss": 0.9374,
31
  "step": 75
32
  },
33
  {
34
- "epoch": 0.1646090534979424,
35
- "grad_norm": 9.542637825012207,
36
  "learning_rate": 1.94e-06,
37
- "loss": 0.7622,
38
  "step": 100
39
  },
40
  {
41
- "epoch": 0.205761316872428,
42
- "grad_norm": 9.52396297454834,
43
  "learning_rate": 2.4400000000000004e-06,
44
- "loss": 0.7141,
45
  "step": 125
46
  },
47
  {
48
- "epoch": 0.24691358024691357,
49
- "grad_norm": 7.540445804595947,
50
  "learning_rate": 2.9400000000000002e-06,
51
- "loss": 0.6652,
52
  "step": 150
53
  },
54
  {
55
- "epoch": 0.2880658436213992,
56
- "grad_norm": 7.401683807373047,
57
  "learning_rate": 3.44e-06,
58
- "loss": 0.5939,
59
  "step": 175
60
  },
61
  {
62
- "epoch": 0.3292181069958848,
63
- "grad_norm": 9.240742683410645,
64
  "learning_rate": 3.94e-06,
65
- "loss": 0.5542,
66
  "step": 200
67
  },
68
  {
69
- "epoch": 0.37037037037037035,
70
- "grad_norm": 7.216451644897461,
71
  "learning_rate": 4.440000000000001e-06,
72
- "loss": 0.544,
73
  "step": 225
74
  },
75
  {
76
- "epoch": 0.411522633744856,
77
- "grad_norm": 7.483836650848389,
78
  "learning_rate": 4.94e-06,
79
- "loss": 0.5081,
80
  "step": 250
81
  },
82
  {
83
- "epoch": 0.45267489711934156,
84
- "grad_norm": 6.959393501281738,
85
  "learning_rate": 5.4400000000000004e-06,
86
- "loss": 0.4608,
87
  "step": 275
88
  },
89
  {
90
- "epoch": 0.49382716049382713,
91
- "grad_norm": 5.851141452789307,
92
  "learning_rate": 5.94e-06,
93
- "loss": 0.4661,
94
  "step": 300
95
  },
96
  {
97
- "epoch": 0.5349794238683128,
98
- "grad_norm": 6.5286712646484375,
99
  "learning_rate": 6.440000000000001e-06,
100
- "loss": 0.4301,
101
  "step": 325
102
  },
103
  {
104
- "epoch": 0.5761316872427984,
105
- "grad_norm": 7.329245567321777,
106
  "learning_rate": 6.9400000000000005e-06,
107
- "loss": 0.4293,
108
  "step": 350
109
  },
110
  {
111
- "epoch": 0.6172839506172839,
112
- "grad_norm": 7.156967639923096,
113
  "learning_rate": 7.440000000000001e-06,
114
- "loss": 0.4247,
115
  "step": 375
116
  },
117
  {
118
- "epoch": 0.6584362139917695,
119
- "grad_norm": 6.780092716217041,
120
  "learning_rate": 7.94e-06,
121
- "loss": 0.4084,
122
  "step": 400
123
  },
124
  {
125
- "epoch": 0.6995884773662552,
126
- "grad_norm": 5.79763126373291,
127
  "learning_rate": 8.44e-06,
128
- "loss": 0.3924,
129
  "step": 425
130
  },
131
  {
132
- "epoch": 0.7407407407407407,
133
- "grad_norm": 6.162405014038086,
134
  "learning_rate": 8.94e-06,
135
- "loss": 0.3641,
136
  "step": 450
137
  },
138
  {
139
- "epoch": 0.7818930041152263,
140
- "grad_norm": 6.669307231903076,
141
  "learning_rate": 9.440000000000001e-06,
142
- "loss": 0.3631,
143
  "step": 475
144
  },
145
  {
146
- "epoch": 0.823045267489712,
147
- "grad_norm": 5.827241897583008,
148
  "learning_rate": 9.940000000000001e-06,
149
- "loss": 0.3607,
150
  "step": 500
151
  },
152
  {
153
- "epoch": 0.8641975308641975,
154
- "grad_norm": 7.120790004730225,
155
- "learning_rate": 9.912000000000001e-06,
156
- "loss": 0.3396,
157
  "step": 525
158
  },
159
  {
160
- "epoch": 0.9053497942386831,
161
- "grad_norm": 5.817197322845459,
162
- "learning_rate": 9.812e-06,
163
- "loss": 0.3416,
164
  "step": 550
165
  },
166
  {
167
- "epoch": 0.9465020576131687,
168
- "grad_norm": 6.041031837463379,
169
- "learning_rate": 9.712e-06,
170
- "loss": 0.3415,
171
  "step": 575
172
  },
173
  {
174
- "epoch": 0.9876543209876543,
175
- "grad_norm": 5.69822883605957,
176
- "learning_rate": 9.612000000000002e-06,
177
- "loss": 0.3304,
178
  "step": 600
179
  },
180
  {
181
- "epoch": 1.0279835390946501,
182
- "grad_norm": 5.423857688903809,
183
- "learning_rate": 9.512000000000001e-06,
184
- "loss": 0.2518,
185
  "step": 625
186
  },
187
  {
188
- "epoch": 1.0691358024691358,
189
- "grad_norm": 4.724055767059326,
190
- "learning_rate": 9.412e-06,
191
- "loss": 0.2157,
192
  "step": 650
193
  },
194
  {
195
- "epoch": 1.1102880658436214,
196
- "grad_norm": 4.128566265106201,
197
- "learning_rate": 9.312000000000002e-06,
198
- "loss": 0.2118,
199
  "step": 675
200
  },
201
  {
202
- "epoch": 1.151440329218107,
203
- "grad_norm": 4.85585880279541,
204
- "learning_rate": 9.212000000000001e-06,
205
- "loss": 0.2327,
206
  "step": 700
207
  },
208
  {
209
- "epoch": 1.1925925925925926,
210
- "grad_norm": 5.799758434295654,
211
- "learning_rate": 9.112e-06,
212
- "loss": 0.2041,
213
  "step": 725
214
  },
215
  {
216
- "epoch": 1.2337448559670783,
217
- "grad_norm": 3.91048002243042,
218
- "learning_rate": 9.012000000000001e-06,
219
- "loss": 0.1966,
220
  "step": 750
221
  },
222
  {
223
- "epoch": 1.274897119341564,
224
- "grad_norm": 5.111474990844727,
225
- "learning_rate": 8.912e-06,
226
- "loss": 0.2009,
227
  "step": 775
228
  },
229
  {
230
- "epoch": 1.3160493827160493,
231
- "grad_norm": 5.129931449890137,
232
- "learning_rate": 8.812000000000001e-06,
233
- "loss": 0.177,
234
  "step": 800
235
  },
236
  {
237
- "epoch": 1.357201646090535,
238
- "grad_norm": 3.5538148880004883,
239
- "learning_rate": 8.712e-06,
240
- "loss": 0.1988,
241
  "step": 825
242
  },
243
  {
244
- "epoch": 1.3983539094650206,
245
- "grad_norm": 4.522074222564697,
246
- "learning_rate": 8.612e-06,
247
- "loss": 0.1998,
248
  "step": 850
249
  },
250
  {
251
- "epoch": 1.4395061728395062,
252
- "grad_norm": 4.1635637283325195,
253
- "learning_rate": 8.512e-06,
254
- "loss": 0.1815,
255
  "step": 875
256
  },
257
  {
258
- "epoch": 1.4806584362139918,
259
- "grad_norm": 4.162914752960205,
260
- "learning_rate": 8.412e-06,
261
- "loss": 0.181,
262
  "step": 900
263
  },
264
  {
265
- "epoch": 1.5218106995884773,
266
- "grad_norm": 5.481962203979492,
267
- "learning_rate": 8.312000000000001e-06,
268
- "loss": 0.1817,
269
  "step": 925
270
  },
271
  {
272
- "epoch": 1.5629629629629629,
273
- "grad_norm": 4.371176242828369,
274
- "learning_rate": 8.212e-06,
275
- "loss": 0.1855,
276
  "step": 950
277
  },
278
  {
279
- "epoch": 1.6041152263374485,
280
- "grad_norm": 4.259627819061279,
281
- "learning_rate": 8.112000000000001e-06,
282
- "loss": 0.1768,
283
  "step": 975
284
  },
285
  {
286
- "epoch": 1.6452674897119342,
287
- "grad_norm": 4.984127044677734,
288
- "learning_rate": 8.012e-06,
289
- "loss": 0.1981,
290
  "step": 1000
291
  },
292
  {
293
- "epoch": 1.6452674897119342,
294
- "eval_loss": 0.2714756429195404,
295
- "eval_runtime": 1557.3098,
296
- "eval_samples_per_second": 1.682,
297
- "eval_steps_per_second": 0.421,
298
- "eval_wer": 0.14547641612597506,
299
  "step": 1000
300
  },
301
  {
302
- "epoch": 1.6864197530864198,
303
- "grad_norm": 3.931295156478882,
304
- "learning_rate": 7.912000000000001e-06,
305
- "loss": 0.1774,
306
  "step": 1025
307
  },
308
  {
309
- "epoch": 1.7275720164609054,
310
- "grad_norm": 4.057419300079346,
311
- "learning_rate": 7.812e-06,
312
- "loss": 0.1789,
313
  "step": 1050
314
  },
315
  {
316
- "epoch": 1.768724279835391,
317
- "grad_norm": 3.723196029663086,
318
- "learning_rate": 7.712e-06,
319
- "loss": 0.1703,
320
  "step": 1075
321
  },
322
  {
323
- "epoch": 1.8098765432098767,
324
- "grad_norm": 5.39990758895874,
325
- "learning_rate": 7.612e-06,
326
- "loss": 0.1815,
327
  "step": 1100
328
  },
329
  {
330
- "epoch": 1.851028806584362,
331
- "grad_norm": 4.207727909088135,
332
- "learning_rate": 7.512e-06,
333
- "loss": 0.1561,
334
  "step": 1125
335
  },
336
  {
337
- "epoch": 1.8921810699588477,
338
- "grad_norm": 5.286857604980469,
339
- "learning_rate": 7.412e-06,
340
- "loss": 0.1848,
341
  "step": 1150
342
  },
343
  {
344
- "epoch": 1.9333333333333333,
345
- "grad_norm": 3.916595697402954,
346
- "learning_rate": 7.3120000000000005e-06,
347
- "loss": 0.1673,
348
  "step": 1175
349
  },
350
  {
351
- "epoch": 1.974485596707819,
352
- "grad_norm": 5.331394195556641,
353
- "learning_rate": 7.212e-06,
354
- "loss": 0.1939,
355
  "step": 1200
356
  },
357
  {
358
- "epoch": 2.0148148148148146,
359
- "grad_norm": 2.4200124740600586,
360
- "learning_rate": 7.1120000000000015e-06,
361
- "loss": 0.1235,
362
  "step": 1225
363
  },
364
  {
365
- "epoch": 2.0559670781893002,
366
- "grad_norm": 6.067932605743408,
367
- "learning_rate": 7.012000000000001e-06,
368
- "loss": 0.0826,
369
  "step": 1250
370
  },
371
  {
372
- "epoch": 2.097119341563786,
373
- "grad_norm": 3.729630708694458,
374
- "learning_rate": 6.912000000000001e-06,
375
- "loss": 0.0721,
376
  "step": 1275
377
  },
378
  {
379
- "epoch": 2.1382716049382715,
380
- "grad_norm": 2.2330338954925537,
381
- "learning_rate": 6.812000000000001e-06,
382
- "loss": 0.0712,
383
  "step": 1300
384
  },
385
  {
386
- "epoch": 2.179423868312757,
387
- "grad_norm": 2.977776288986206,
388
- "learning_rate": 6.712000000000001e-06,
389
- "loss": 0.069,
390
  "step": 1325
391
  },
392
  {
393
- "epoch": 2.2205761316872428,
394
- "grad_norm": 3.0866611003875732,
395
- "learning_rate": 6.612e-06,
396
- "loss": 0.077,
397
  "step": 1350
398
  },
399
  {
400
- "epoch": 2.2617283950617284,
401
- "grad_norm": 1.9583996534347534,
402
- "learning_rate": 6.5120000000000005e-06,
403
- "loss": 0.0681,
404
  "step": 1375
405
  },
406
  {
407
- "epoch": 2.302880658436214,
408
- "grad_norm": 3.112598419189453,
409
- "learning_rate": 6.412000000000001e-06,
410
- "loss": 0.0766,
411
  "step": 1400
412
  },
413
  {
414
- "epoch": 2.3440329218106997,
415
- "grad_norm": 3.2026150226593018,
416
- "learning_rate": 6.312000000000001e-06,
417
- "loss": 0.0776,
418
  "step": 1425
419
  },
420
  {
421
- "epoch": 2.3851851851851853,
422
- "grad_norm": 3.7830779552459717,
423
- "learning_rate": 6.212e-06,
424
- "loss": 0.0763,
425
  "step": 1450
426
  },
427
  {
428
- "epoch": 2.426337448559671,
429
- "grad_norm": 1.5785398483276367,
430
- "learning_rate": 6.112e-06,
431
- "loss": 0.0666,
432
  "step": 1475
433
  },
434
  {
435
- "epoch": 2.4674897119341566,
436
- "grad_norm": 3.071021795272827,
437
- "learning_rate": 6.012e-06,
438
- "loss": 0.074,
439
  "step": 1500
440
  },
441
  {
442
- "epoch": 2.5086419753086417,
443
- "grad_norm": 2.2543785572052,
444
- "learning_rate": 5.912e-06,
445
- "loss": 0.0711,
446
  "step": 1525
447
  },
448
  {
449
- "epoch": 2.549794238683128,
450
- "grad_norm": 2.8244235515594482,
451
- "learning_rate": 5.812000000000001e-06,
452
- "loss": 0.068,
453
  "step": 1550
454
  },
455
  {
456
- "epoch": 2.590946502057613,
457
- "grad_norm": 1.933445930480957,
458
- "learning_rate": 5.7120000000000005e-06,
459
- "loss": 0.0797,
460
  "step": 1575
461
  },
462
  {
463
- "epoch": 2.6320987654320986,
464
- "grad_norm": 2.2213869094848633,
465
- "learning_rate": 5.612000000000001e-06,
466
- "loss": 0.072,
467
  "step": 1600
468
  },
469
  {
470
- "epoch": 2.6732510288065843,
471
- "grad_norm": 3.0998716354370117,
472
- "learning_rate": 5.512000000000001e-06,
473
- "loss": 0.0638,
474
  "step": 1625
475
  },
476
  {
477
- "epoch": 2.71440329218107,
478
- "grad_norm": 3.967010974884033,
479
- "learning_rate": 5.412000000000001e-06,
480
- "loss": 0.0728,
481
  "step": 1650
482
  },
483
  {
484
- "epoch": 2.7555555555555555,
485
- "grad_norm": 1.7496695518493652,
486
- "learning_rate": 5.312e-06,
487
- "loss": 0.0677,
488
  "step": 1675
489
  },
490
  {
491
- "epoch": 2.796707818930041,
492
- "grad_norm": 3.4264888763427734,
493
- "learning_rate": 5.212e-06,
494
- "loss": 0.0711,
495
  "step": 1700
496
  },
497
  {
498
- "epoch": 2.837860082304527,
499
- "grad_norm": 3.8687572479248047,
500
- "learning_rate": 5.112e-06,
501
- "loss": 0.0694,
502
  "step": 1725
503
  },
504
  {
505
- "epoch": 2.8790123456790124,
506
- "grad_norm": 2.948435068130493,
507
- "learning_rate": 5.0120000000000005e-06,
508
- "loss": 0.0734,
509
  "step": 1750
510
  },
511
  {
512
- "epoch": 2.920164609053498,
513
- "grad_norm": 2.5124669075012207,
514
- "learning_rate": 4.9120000000000006e-06,
515
- "loss": 0.0789,
516
  "step": 1775
517
  },
518
  {
519
- "epoch": 2.9613168724279837,
520
- "grad_norm": 2.759510040283203,
521
- "learning_rate": 4.812000000000001e-06,
522
- "loss": 0.0616,
523
  "step": 1800
524
  },
525
  {
526
- "epoch": 3.0016460905349795,
527
- "grad_norm": 1.3032381534576416,
528
- "learning_rate": 4.712000000000001e-06,
529
- "loss": 0.0516,
530
  "step": 1825
531
  },
532
  {
533
- "epoch": 3.042798353909465,
534
- "grad_norm": 1.752594232559204,
535
- "learning_rate": 4.612e-06,
536
- "loss": 0.0232,
537
  "step": 1850
538
  },
539
  {
540
- "epoch": 3.083950617283951,
541
- "grad_norm": 1.5227144956588745,
542
- "learning_rate": 4.512e-06,
543
- "loss": 0.0295,
544
  "step": 1875
545
  },
546
  {
547
- "epoch": 3.125102880658436,
548
- "grad_norm": 1.635178565979004,
549
- "learning_rate": 4.412e-06,
550
- "loss": 0.0244,
551
  "step": 1900
552
  },
553
  {
554
- "epoch": 3.1662551440329216,
555
- "grad_norm": 1.666857123374939,
556
- "learning_rate": 4.312e-06,
557
- "loss": 0.0232,
558
  "step": 1925
559
  },
560
  {
561
- "epoch": 3.2074074074074073,
562
- "grad_norm": 1.3419064283370972,
563
- "learning_rate": 4.2120000000000005e-06,
564
- "loss": 0.0245,
565
  "step": 1950
566
  },
567
  {
568
- "epoch": 3.248559670781893,
569
- "grad_norm": 1.2259336709976196,
570
- "learning_rate": 4.112000000000001e-06,
571
- "loss": 0.0286,
572
  "step": 1975
573
  },
574
  {
575
- "epoch": 3.2897119341563785,
576
- "grad_norm": 1.062391757965088,
577
- "learning_rate": 4.012000000000001e-06,
578
- "loss": 0.0236,
579
  "step": 2000
580
  },
581
  {
582
- "epoch": 3.2897119341563785,
583
- "eval_loss": 0.27091845870018005,
584
- "eval_runtime": 1588.6268,
585
- "eval_samples_per_second": 1.649,
586
- "eval_steps_per_second": 0.412,
587
- "eval_wer": 0.12672231537508202,
588
  "step": 2000
589
  },
590
  {
591
- "epoch": 3.330864197530864,
592
- "grad_norm": 2.021160840988159,
593
- "learning_rate": 3.912e-06,
594
- "loss": 0.0256,
595
  "step": 2025
596
  },
597
  {
598
- "epoch": 3.37201646090535,
599
- "grad_norm": 1.8684940338134766,
600
- "learning_rate": 3.812e-06,
601
- "loss": 0.026,
602
  "step": 2050
603
  },
604
  {
605
- "epoch": 3.4131687242798354,
606
- "grad_norm": 0.7061536908149719,
607
- "learning_rate": 3.712e-06,
608
- "loss": 0.0225,
609
  "step": 2075
610
  },
611
  {
612
- "epoch": 3.454320987654321,
613
- "grad_norm": 1.89629328250885,
614
- "learning_rate": 3.6120000000000003e-06,
615
- "loss": 0.0273,
616
  "step": 2100
617
  },
618
  {
619
- "epoch": 3.4954732510288067,
620
- "grad_norm": 1.4353647232055664,
621
- "learning_rate": 3.5120000000000004e-06,
622
- "loss": 0.0231,
623
  "step": 2125
624
  },
625
  {
626
- "epoch": 3.5366255144032923,
627
- "grad_norm": 0.97385573387146,
628
- "learning_rate": 3.412e-06,
629
- "loss": 0.0224,
630
  "step": 2150
631
  },
632
  {
633
- "epoch": 3.5777777777777775,
634
- "grad_norm": 1.3879934549331665,
635
- "learning_rate": 3.3120000000000002e-06,
636
- "loss": 0.0247,
637
  "step": 2175
638
  },
639
  {
640
- "epoch": 3.6189300411522636,
641
- "grad_norm": 1.5851539373397827,
642
- "learning_rate": 3.212e-06,
643
- "loss": 0.0239,
644
  "step": 2200
645
  },
646
  {
647
- "epoch": 3.6600823045267488,
648
- "grad_norm": 0.3798752725124359,
649
- "learning_rate": 3.112e-06,
650
- "loss": 0.0268,
651
  "step": 2225
652
  },
653
  {
654
- "epoch": 3.701234567901235,
655
- "grad_norm": 2.217914342880249,
656
- "learning_rate": 3.0120000000000006e-06,
657
- "loss": 0.0226,
658
  "step": 2250
659
  },
660
  {
661
- "epoch": 3.74238683127572,
662
- "grad_norm": 1.0964093208312988,
663
- "learning_rate": 2.9120000000000002e-06,
664
- "loss": 0.0345,
665
  "step": 2275
666
  },
667
  {
668
- "epoch": 3.7835390946502057,
669
- "grad_norm": 0.7668254375457764,
670
- "learning_rate": 2.8120000000000004e-06,
671
- "loss": 0.0247,
672
  "step": 2300
673
  },
674
  {
675
- "epoch": 3.8246913580246913,
676
- "grad_norm": 2.1343576908111572,
677
- "learning_rate": 2.712e-06,
678
- "loss": 0.0188,
679
  "step": 2325
680
  },
681
  {
682
- "epoch": 3.865843621399177,
683
- "grad_norm": 3.1728477478027344,
684
- "learning_rate": 2.612e-06,
685
- "loss": 0.0304,
686
  "step": 2350
687
  },
688
  {
689
- "epoch": 3.9069958847736626,
690
- "grad_norm": 0.8707403540611267,
691
- "learning_rate": 2.512e-06,
692
- "loss": 0.0287,
693
  "step": 2375
694
  },
695
  {
696
- "epoch": 3.948148148148148,
697
- "grad_norm": 1.3772474527359009,
698
- "learning_rate": 2.4120000000000004e-06,
699
- "loss": 0.021,
700
  "step": 2400
701
  },
702
  {
703
- "epoch": 3.989300411522634,
704
- "grad_norm": 0.8075869083404541,
705
- "learning_rate": 2.312e-06,
706
- "loss": 0.0224,
707
  "step": 2425
708
  },
709
  {
710
- "epoch": 4.029629629629629,
711
- "grad_norm": 0.8268884420394897,
712
- "learning_rate": 2.212e-06,
713
- "loss": 0.0114,
714
  "step": 2450
715
  },
716
  {
717
- "epoch": 4.070781893004115,
718
- "grad_norm": 0.6684131026268005,
719
- "learning_rate": 2.1120000000000003e-06,
720
- "loss": 0.008,
721
  "step": 2475
722
  },
723
  {
724
- "epoch": 4.1119341563786005,
725
- "grad_norm": 0.2903365194797516,
726
- "learning_rate": 2.012e-06,
727
- "loss": 0.0148,
728
  "step": 2500
729
  },
730
  {
731
- "epoch": 4.153086419753087,
732
- "grad_norm": 2.815356731414795,
733
- "learning_rate": 1.912e-06,
734
- "loss": 0.0112,
735
  "step": 2525
736
  },
737
  {
738
- "epoch": 4.194238683127572,
739
- "grad_norm": 0.7152761816978455,
740
- "learning_rate": 1.8120000000000002e-06,
741
- "loss": 0.0135,
742
  "step": 2550
743
  },
744
  {
745
- "epoch": 4.235390946502058,
746
- "grad_norm": 0.7304953932762146,
747
- "learning_rate": 1.712e-06,
748
- "loss": 0.0076,
749
  "step": 2575
750
  },
751
  {
752
- "epoch": 4.276543209876543,
753
- "grad_norm": 1.0283336639404297,
754
- "learning_rate": 1.6120000000000002e-06,
755
- "loss": 0.0138,
756
  "step": 2600
757
  },
758
  {
759
- "epoch": 4.317695473251029,
760
- "grad_norm": 0.19157454371452332,
761
- "learning_rate": 1.512e-06,
762
- "loss": 0.0059,
763
  "step": 2625
764
  },
765
  {
766
- "epoch": 4.358847736625514,
767
- "grad_norm": 0.8817554116249084,
768
- "learning_rate": 1.412e-06,
769
- "loss": 0.0059,
770
  "step": 2650
771
  },
772
  {
773
- "epoch": 4.4,
774
- "grad_norm": 0.772783637046814,
775
- "learning_rate": 1.3120000000000003e-06,
776
- "loss": 0.0067,
777
  "step": 2675
778
  },
779
  {
780
- "epoch": 4.4411522633744855,
781
- "grad_norm": 1.7318344116210938,
782
- "learning_rate": 1.2120000000000002e-06,
783
- "loss": 0.0091,
784
  "step": 2700
785
  },
786
  {
787
- "epoch": 4.482304526748971,
788
- "grad_norm": 0.4040825366973877,
789
- "learning_rate": 1.1120000000000001e-06,
790
- "loss": 0.0075,
791
  "step": 2725
792
  },
793
  {
794
- "epoch": 4.523456790123457,
795
- "grad_norm": 0.22325092554092407,
796
- "learning_rate": 1.012e-06,
797
- "loss": 0.0068,
798
  "step": 2750
799
  },
800
  {
801
- "epoch": 4.564609053497943,
802
- "grad_norm": 0.44631338119506836,
803
- "learning_rate": 9.120000000000001e-07,
804
- "loss": 0.0065,
805
  "step": 2775
806
  },
807
  {
808
- "epoch": 4.605761316872428,
809
- "grad_norm": 1.0319997072219849,
810
- "learning_rate": 8.12e-07,
811
- "loss": 0.0071,
812
  "step": 2800
813
  },
814
  {
815
- "epoch": 4.646913580246913,
816
- "grad_norm": 1.9721506834030151,
817
- "learning_rate": 7.12e-07,
818
- "loss": 0.0119,
819
  "step": 2825
820
  },
821
  {
822
- "epoch": 4.688065843621399,
823
- "grad_norm": 0.18514037132263184,
824
- "learning_rate": 6.12e-07,
825
- "loss": 0.0067,
826
  "step": 2850
827
  },
828
  {
829
- "epoch": 4.7292181069958845,
830
- "grad_norm": 0.5566908121109009,
831
- "learning_rate": 5.12e-07,
832
- "loss": 0.0077,
833
  "step": 2875
834
  },
835
  {
836
- "epoch": 4.770370370370371,
837
- "grad_norm": 0.2684698700904846,
838
- "learning_rate": 4.1200000000000004e-07,
839
- "loss": 0.0093,
840
  "step": 2900
841
  },
842
  {
843
- "epoch": 4.811522633744856,
844
- "grad_norm": 0.36519676446914673,
845
- "learning_rate": 3.12e-07,
846
- "loss": 0.0077,
847
  "step": 2925
848
  },
849
  {
850
- "epoch": 4.852674897119342,
851
- "grad_norm": 0.5899785161018372,
852
- "learning_rate": 2.1200000000000002e-07,
853
- "loss": 0.0059,
854
  "step": 2950
855
  },
856
  {
857
- "epoch": 4.893827160493827,
858
- "grad_norm": 0.5213066339492798,
859
- "learning_rate": 1.1200000000000001e-07,
860
- "loss": 0.0051,
861
  "step": 2975
862
  },
863
  {
864
- "epoch": 4.934979423868313,
865
- "grad_norm": 0.2481735646724701,
866
- "learning_rate": 1.2e-08,
867
- "loss": 0.0066,
868
  "step": 3000
869
  },
870
  {
871
- "epoch": 4.934979423868313,
872
- "eval_loss": 0.2823709547519684,
873
- "eval_runtime": 1578.0998,
874
- "eval_samples_per_second": 1.66,
875
- "eval_steps_per_second": 0.415,
876
- "eval_wer": 0.12251221112488153,
877
  "step": 3000
878
  },
879
  {
880
- "epoch": 4.934979423868313,
881
- "step": 3000,
882
- "total_flos": 4.894412894502912e+19,
883
- "train_loss": 0.1704294042487939,
884
- "train_runtime": 13123.9045,
885
- "train_samples_per_second": 3.657,
886
- "train_steps_per_second": 0.229
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
887
  }
888
  ],
889
  "logging_steps": 25,
890
- "max_steps": 3000,
891
  "num_input_tokens_seen": 0,
892
- "num_train_epochs": 5,
893
  "save_steps": 1000,
894
  "stateful_callbacks": {
895
  "TrainerControl": {
@@ -903,7 +1481,7 @@
903
  "attributes": {}
904
  }
905
  },
906
- "total_flos": 4.894412894502912e+19,
907
  "train_batch_size": 4,
908
  "trial_name": null,
909
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 14.79466271312083,
6
  "eval_steps": 1000,
7
+ "global_step": 5000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.07412898443291327,
14
+ "grad_norm": 11.929125785827637,
15
+ "learning_rate": 4.4e-07,
16
+ "loss": 1.6381,
17
  "step": 25
18
  },
19
  {
20
+ "epoch": 0.14825796886582654,
21
+ "grad_norm": 8.018507957458496,
22
+ "learning_rate": 9.400000000000001e-07,
23
+ "loss": 1.3896,
24
  "step": 50
25
  },
26
  {
27
+ "epoch": 0.2223869532987398,
28
+ "grad_norm": 6.990742206573486,
29
  "learning_rate": 1.44e-06,
30
+ "loss": 1.2335,
31
  "step": 75
32
  },
33
  {
34
+ "epoch": 0.2965159377316531,
35
+ "grad_norm": 7.199395656585693,
36
  "learning_rate": 1.94e-06,
37
+ "loss": 1.0625,
38
  "step": 100
39
  },
40
  {
41
+ "epoch": 0.37064492216456635,
42
+ "grad_norm": 5.8580546379089355,
43
  "learning_rate": 2.4400000000000004e-06,
44
+ "loss": 0.9354,
45
  "step": 125
46
  },
47
  {
48
+ "epoch": 0.4447739065974796,
49
+ "grad_norm": 6.414823055267334,
50
  "learning_rate": 2.9400000000000002e-06,
51
+ "loss": 0.8125,
52
  "step": 150
53
  },
54
  {
55
+ "epoch": 0.5189028910303929,
56
+ "grad_norm": 6.512091159820557,
57
  "learning_rate": 3.44e-06,
58
+ "loss": 0.7764,
59
  "step": 175
60
  },
61
  {
62
+ "epoch": 0.5930318754633062,
63
+ "grad_norm": 6.021186351776123,
64
  "learning_rate": 3.94e-06,
65
+ "loss": 0.7381,
66
  "step": 200
67
  },
68
  {
69
+ "epoch": 0.6671608598962194,
70
+ "grad_norm": 5.814141273498535,
71
  "learning_rate": 4.440000000000001e-06,
72
+ "loss": 0.6923,
73
  "step": 225
74
  },
75
  {
76
+ "epoch": 0.7412898443291327,
77
+ "grad_norm": 5.2472968101501465,
78
  "learning_rate": 4.94e-06,
79
+ "loss": 0.6586,
80
  "step": 250
81
  },
82
  {
83
+ "epoch": 0.815418828762046,
84
+ "grad_norm": 5.4379401206970215,
85
  "learning_rate": 5.4400000000000004e-06,
86
+ "loss": 0.6041,
87
  "step": 275
88
  },
89
  {
90
+ "epoch": 0.8895478131949592,
91
+ "grad_norm": 5.554481029510498,
92
  "learning_rate": 5.94e-06,
93
+ "loss": 0.5964,
94
  "step": 300
95
  },
96
  {
97
+ "epoch": 0.9636767976278725,
98
+ "grad_norm": 6.5946502685546875,
99
  "learning_rate": 6.440000000000001e-06,
100
+ "loss": 0.5586,
101
  "step": 325
102
  },
103
  {
104
+ "epoch": 1.0355819125277983,
105
+ "grad_norm": 4.164698600769043,
106
  "learning_rate": 6.9400000000000005e-06,
107
+ "loss": 0.4838,
108
  "step": 350
109
  },
110
  {
111
+ "epoch": 1.1097108969607117,
112
+ "grad_norm": 6.609652519226074,
113
  "learning_rate": 7.440000000000001e-06,
114
+ "loss": 0.4744,
115
  "step": 375
116
  },
117
  {
118
+ "epoch": 1.1838398813936248,
119
+ "grad_norm": 4.401813507080078,
120
  "learning_rate": 7.94e-06,
121
+ "loss": 0.4447,
122
  "step": 400
123
  },
124
  {
125
+ "epoch": 1.2579688658265382,
126
+ "grad_norm": 4.387920379638672,
127
  "learning_rate": 8.44e-06,
128
+ "loss": 0.4425,
129
  "step": 425
130
  },
131
  {
132
+ "epoch": 1.3320978502594514,
133
+ "grad_norm": 4.439324855804443,
134
  "learning_rate": 8.94e-06,
135
+ "loss": 0.3978,
136
  "step": 450
137
  },
138
  {
139
+ "epoch": 1.4062268346923648,
140
+ "grad_norm": 4.0201826095581055,
141
  "learning_rate": 9.440000000000001e-06,
142
+ "loss": 0.4059,
143
  "step": 475
144
  },
145
  {
146
+ "epoch": 1.480355819125278,
147
+ "grad_norm": 4.707728385925293,
148
  "learning_rate": 9.940000000000001e-06,
149
+ "loss": 0.3959,
150
  "step": 500
151
  },
152
  {
153
+ "epoch": 1.5544848035581913,
154
+ "grad_norm": 4.224379062652588,
155
+ "learning_rate": 9.951111111111111e-06,
156
+ "loss": 0.3747,
157
  "step": 525
158
  },
159
  {
160
+ "epoch": 1.6286137879911045,
161
+ "grad_norm": 4.27752161026001,
162
+ "learning_rate": 9.895555555555557e-06,
163
+ "loss": 0.3749,
164
  "step": 550
165
  },
166
  {
167
+ "epoch": 1.7027427724240178,
168
+ "grad_norm": 4.656911373138428,
169
+ "learning_rate": 9.84e-06,
170
+ "loss": 0.3667,
171
  "step": 575
172
  },
173
  {
174
+ "epoch": 1.7768717568569312,
175
+ "grad_norm": 4.724191188812256,
176
+ "learning_rate": 9.784444444444445e-06,
177
+ "loss": 0.3474,
178
  "step": 600
179
  },
180
  {
181
+ "epoch": 1.8510007412898444,
182
+ "grad_norm": 4.367356300354004,
183
+ "learning_rate": 9.72888888888889e-06,
184
+ "loss": 0.3445,
185
  "step": 625
186
  },
187
  {
188
+ "epoch": 1.9251297257227575,
189
+ "grad_norm": 4.204794406890869,
190
+ "learning_rate": 9.673333333333334e-06,
191
+ "loss": 0.3353,
192
  "step": 650
193
  },
194
  {
195
+ "epoch": 1.999258710155671,
196
+ "grad_norm": 3.796111583709717,
197
+ "learning_rate": 9.617777777777778e-06,
198
+ "loss": 0.3136,
199
  "step": 675
200
  },
201
  {
202
+ "epoch": 2.0711638250555966,
203
+ "grad_norm": 3.014369249343872,
204
+ "learning_rate": 9.562222222222223e-06,
205
+ "loss": 0.2109,
206
  "step": 700
207
  },
208
  {
209
+ "epoch": 2.14529280948851,
210
+ "grad_norm": 3.4962964057922363,
211
+ "learning_rate": 9.506666666666667e-06,
212
+ "loss": 0.2112,
213
  "step": 725
214
  },
215
  {
216
+ "epoch": 2.2194217939214234,
217
+ "grad_norm": 3.688762903213501,
218
+ "learning_rate": 9.451111111111112e-06,
219
+ "loss": 0.2122,
220
  "step": 750
221
  },
222
  {
223
+ "epoch": 2.2935507783543367,
224
+ "grad_norm": 4.557786464691162,
225
+ "learning_rate": 9.395555555555556e-06,
226
+ "loss": 0.2107,
227
  "step": 775
228
  },
229
  {
230
+ "epoch": 2.3676797627872497,
231
+ "grad_norm": 3.3230926990509033,
232
+ "learning_rate": 9.340000000000002e-06,
233
+ "loss": 0.2115,
234
  "step": 800
235
  },
236
  {
237
+ "epoch": 2.441808747220163,
238
+ "grad_norm": 3.251685380935669,
239
+ "learning_rate": 9.284444444444444e-06,
240
+ "loss": 0.2021,
241
  "step": 825
242
  },
243
  {
244
+ "epoch": 2.5159377316530764,
245
+ "grad_norm": 3.852593421936035,
246
+ "learning_rate": 9.22888888888889e-06,
247
+ "loss": 0.1999,
248
  "step": 850
249
  },
250
  {
251
+ "epoch": 2.59006671608599,
252
+ "grad_norm": 3.0748980045318604,
253
+ "learning_rate": 9.173333333333334e-06,
254
+ "loss": 0.1936,
255
  "step": 875
256
  },
257
  {
258
+ "epoch": 2.6641957005189028,
259
+ "grad_norm": 2.834657907485962,
260
+ "learning_rate": 9.117777777777778e-06,
261
+ "loss": 0.2031,
262
  "step": 900
263
  },
264
  {
265
+ "epoch": 2.738324684951816,
266
+ "grad_norm": 2.9382002353668213,
267
+ "learning_rate": 9.062222222222224e-06,
268
+ "loss": 0.199,
269
  "step": 925
270
  },
271
  {
272
+ "epoch": 2.8124536693847295,
273
+ "grad_norm": 2.9481730461120605,
274
+ "learning_rate": 9.006666666666666e-06,
275
+ "loss": 0.1867,
276
  "step": 950
277
  },
278
  {
279
+ "epoch": 2.8865826538176425,
280
+ "grad_norm": 3.9300479888916016,
281
+ "learning_rate": 8.951111111111112e-06,
282
+ "loss": 0.203,
283
  "step": 975
284
  },
285
  {
286
+ "epoch": 2.960711638250556,
287
+ "grad_norm": 3.3928661346435547,
288
+ "learning_rate": 8.895555555555556e-06,
289
+ "loss": 0.2023,
290
  "step": 1000
291
  },
292
  {
293
+ "epoch": 2.960711638250556,
294
+ "eval_loss": 0.3293725550174713,
295
+ "eval_runtime": 936.3299,
296
+ "eval_samples_per_second": 3.081,
297
+ "eval_steps_per_second": 0.386,
298
+ "eval_wer": 0.18741347189225221,
299
  "step": 1000
300
  },
301
  {
302
+ "epoch": 3.032616753150482,
303
+ "grad_norm": 2.452481269836426,
304
+ "learning_rate": 8.84e-06,
305
+ "loss": 0.152,
306
  "step": 1025
307
  },
308
  {
309
+ "epoch": 3.106745737583395,
310
+ "grad_norm": 2.802464485168457,
311
+ "learning_rate": 8.784444444444446e-06,
312
+ "loss": 0.1076,
313
  "step": 1050
314
  },
315
  {
316
+ "epoch": 3.1808747220163083,
317
+ "grad_norm": 3.293215751647949,
318
+ "learning_rate": 8.72888888888889e-06,
319
+ "loss": 0.1114,
320
  "step": 1075
321
  },
322
  {
323
+ "epoch": 3.2550037064492217,
324
+ "grad_norm": 2.4673008918762207,
325
+ "learning_rate": 8.673333333333334e-06,
326
+ "loss": 0.1075,
327
  "step": 1100
328
  },
329
  {
330
+ "epoch": 3.329132690882135,
331
+ "grad_norm": 2.8392369747161865,
332
+ "learning_rate": 8.617777777777778e-06,
333
+ "loss": 0.1112,
334
  "step": 1125
335
  },
336
  {
337
+ "epoch": 3.403261675315048,
338
+ "grad_norm": 2.4692773818969727,
339
+ "learning_rate": 8.562222222222224e-06,
340
+ "loss": 0.1109,
341
  "step": 1150
342
  },
343
  {
344
+ "epoch": 3.4773906597479614,
345
+ "grad_norm": 3.809983968734741,
346
+ "learning_rate": 8.506666666666668e-06,
347
+ "loss": 0.1173,
348
  "step": 1175
349
  },
350
  {
351
+ "epoch": 3.5515196441808747,
352
+ "grad_norm": 2.7140321731567383,
353
+ "learning_rate": 8.451111111111112e-06,
354
+ "loss": 0.1131,
355
  "step": 1200
356
  },
357
  {
358
+ "epoch": 3.625648628613788,
359
+ "grad_norm": 2.5393877029418945,
360
+ "learning_rate": 8.395555555555557e-06,
361
+ "loss": 0.1077,
362
  "step": 1225
363
  },
364
  {
365
+ "epoch": 3.6997776130467015,
366
+ "grad_norm": 2.905407667160034,
367
+ "learning_rate": 8.34e-06,
368
+ "loss": 0.1054,
369
  "step": 1250
370
  },
371
  {
372
+ "epoch": 3.7739065974796144,
373
+ "grad_norm": 2.8358991146087646,
374
+ "learning_rate": 8.284444444444446e-06,
375
+ "loss": 0.1073,
376
  "step": 1275
377
  },
378
  {
379
+ "epoch": 3.848035581912528,
380
+ "grad_norm": 2.4956672191619873,
381
+ "learning_rate": 8.22888888888889e-06,
382
+ "loss": 0.1052,
383
  "step": 1300
384
  },
385
  {
386
+ "epoch": 3.922164566345441,
387
+ "grad_norm": 3.1253950595855713,
388
+ "learning_rate": 8.173333333333334e-06,
389
+ "loss": 0.1064,
390
  "step": 1325
391
  },
392
  {
393
+ "epoch": 3.996293550778354,
394
+ "grad_norm": 3.244260549545288,
395
+ "learning_rate": 8.11777777777778e-06,
396
+ "loss": 0.0973,
397
  "step": 1350
398
  },
399
  {
400
+ "epoch": 4.06819866567828,
401
+ "grad_norm": 1.2010555267333984,
402
+ "learning_rate": 8.062222222222222e-06,
403
+ "loss": 0.0602,
404
  "step": 1375
405
  },
406
  {
407
+ "epoch": 4.142327650111193,
408
+ "grad_norm": 1.965174913406372,
409
+ "learning_rate": 8.006666666666667e-06,
410
+ "loss": 0.0607,
411
  "step": 1400
412
  },
413
  {
414
+ "epoch": 4.216456634544107,
415
+ "grad_norm": 1.7773408889770508,
416
+ "learning_rate": 7.951111111111111e-06,
417
+ "loss": 0.0539,
418
  "step": 1425
419
  },
420
  {
421
+ "epoch": 4.29058561897702,
422
+ "grad_norm": 2.6301095485687256,
423
+ "learning_rate": 7.895555555555557e-06,
424
+ "loss": 0.0606,
425
  "step": 1450
426
  },
427
  {
428
+ "epoch": 4.364714603409933,
429
+ "grad_norm": 1.90291428565979,
430
+ "learning_rate": 7.840000000000001e-06,
431
+ "loss": 0.0584,
432
  "step": 1475
433
  },
434
  {
435
+ "epoch": 4.438843587842847,
436
+ "grad_norm": 2.0360007286071777,
437
+ "learning_rate": 7.784444444444445e-06,
438
+ "loss": 0.0553,
439
  "step": 1500
440
  },
441
  {
442
+ "epoch": 4.51297257227576,
443
+ "grad_norm": 2.2208218574523926,
444
+ "learning_rate": 7.72888888888889e-06,
445
+ "loss": 0.0656,
446
  "step": 1525
447
  },
448
  {
449
+ "epoch": 4.5871015567086735,
450
+ "grad_norm": 1.9523277282714844,
451
+ "learning_rate": 7.673333333333333e-06,
452
+ "loss": 0.0537,
453
  "step": 1550
454
  },
455
  {
456
+ "epoch": 4.661230541141586,
457
+ "grad_norm": 1.7108356952667236,
458
+ "learning_rate": 7.617777777777778e-06,
459
+ "loss": 0.0575,
460
  "step": 1575
461
  },
462
  {
463
+ "epoch": 4.735359525574499,
464
+ "grad_norm": 1.8699008226394653,
465
+ "learning_rate": 7.562222222222223e-06,
466
+ "loss": 0.0548,
467
  "step": 1600
468
  },
469
  {
470
+ "epoch": 4.809488510007413,
471
+ "grad_norm": 2.0190742015838623,
472
+ "learning_rate": 7.506666666666668e-06,
473
+ "loss": 0.0548,
474
  "step": 1625
475
  },
476
  {
477
+ "epoch": 4.883617494440326,
478
+ "grad_norm": 1.592809796333313,
479
+ "learning_rate": 7.451111111111111e-06,
480
+ "loss": 0.0536,
481
  "step": 1650
482
  },
483
  {
484
+ "epoch": 4.957746478873239,
485
+ "grad_norm": 1.7714742422103882,
486
+ "learning_rate": 7.395555555555556e-06,
487
+ "loss": 0.0522,
488
  "step": 1675
489
  },
490
  {
491
+ "epoch": 5.029651593773165,
492
+ "grad_norm": 0.9238072037696838,
493
+ "learning_rate": 7.340000000000001e-06,
494
+ "loss": 0.0427,
495
  "step": 1700
496
  },
497
  {
498
+ "epoch": 5.103780578206079,
499
+ "grad_norm": 1.1125034093856812,
500
+ "learning_rate": 7.284444444444445e-06,
501
+ "loss": 0.0269,
502
  "step": 1725
503
  },
504
  {
505
+ "epoch": 5.177909562638992,
506
+ "grad_norm": 1.247423768043518,
507
+ "learning_rate": 7.22888888888889e-06,
508
+ "loss": 0.0257,
509
  "step": 1750
510
  },
511
  {
512
+ "epoch": 5.252038547071905,
513
+ "grad_norm": 1.1966558694839478,
514
+ "learning_rate": 7.173333333333335e-06,
515
+ "loss": 0.029,
516
  "step": 1775
517
  },
518
  {
519
+ "epoch": 5.326167531504819,
520
+ "grad_norm": 0.9568462371826172,
521
+ "learning_rate": 7.117777777777778e-06,
522
+ "loss": 0.0294,
523
  "step": 1800
524
  },
525
  {
526
+ "epoch": 5.400296515937732,
527
+ "grad_norm": 1.0882899761199951,
528
+ "learning_rate": 7.062222222222223e-06,
529
+ "loss": 0.028,
530
  "step": 1825
531
  },
532
  {
533
+ "epoch": 5.474425500370645,
534
+ "grad_norm": 1.1803560256958008,
535
+ "learning_rate": 7.006666666666667e-06,
536
+ "loss": 0.0295,
537
  "step": 1850
538
  },
539
  {
540
+ "epoch": 5.548554484803558,
541
+ "grad_norm": 1.2876580953598022,
542
+ "learning_rate": 6.951111111111112e-06,
543
+ "loss": 0.0334,
544
  "step": 1875
545
  },
546
  {
547
+ "epoch": 5.622683469236471,
548
+ "grad_norm": 1.7007198333740234,
549
+ "learning_rate": 6.8955555555555565e-06,
550
+ "loss": 0.0294,
551
  "step": 1900
552
  },
553
  {
554
+ "epoch": 5.696812453669384,
555
+ "grad_norm": 0.9945253729820251,
556
+ "learning_rate": 6.8400000000000014e-06,
557
+ "loss": 0.0287,
558
  "step": 1925
559
  },
560
  {
561
+ "epoch": 5.770941438102298,
562
+ "grad_norm": 1.3555738925933838,
563
+ "learning_rate": 6.784444444444445e-06,
564
+ "loss": 0.0278,
565
  "step": 1950
566
  },
567
  {
568
+ "epoch": 5.845070422535211,
569
+ "grad_norm": 1.6404248476028442,
570
+ "learning_rate": 6.7288888888888895e-06,
571
+ "loss": 0.027,
572
  "step": 1975
573
  },
574
  {
575
+ "epoch": 5.919199406968125,
576
+ "grad_norm": 1.1645821332931519,
577
+ "learning_rate": 6.6733333333333335e-06,
578
+ "loss": 0.0309,
579
  "step": 2000
580
  },
581
  {
582
+ "epoch": 5.919199406968125,
583
+ "eval_loss": 0.3413762152194977,
584
+ "eval_runtime": 911.1637,
585
+ "eval_samples_per_second": 3.166,
586
+ "eval_steps_per_second": 0.396,
587
+ "eval_wer": 0.17174013365922025,
588
  "step": 2000
589
  },
590
  {
591
+ "epoch": 5.993328391401038,
592
+ "grad_norm": 1.239682674407959,
593
+ "learning_rate": 6.617777777777778e-06,
594
+ "loss": 0.0246,
595
  "step": 2025
596
  },
597
  {
598
+ "epoch": 6.065233506300964,
599
+ "grad_norm": 0.5780806541442871,
600
+ "learning_rate": 6.562222222222223e-06,
601
+ "loss": 0.0158,
602
  "step": 2050
603
  },
604
  {
605
+ "epoch": 6.139362490733877,
606
+ "grad_norm": 0.7804180383682251,
607
+ "learning_rate": 6.5066666666666665e-06,
608
+ "loss": 0.0149,
609
  "step": 2075
610
  },
611
  {
612
+ "epoch": 6.21349147516679,
613
+ "grad_norm": 1.2513881921768188,
614
+ "learning_rate": 6.451111111111111e-06,
615
+ "loss": 0.0186,
616
  "step": 2100
617
  },
618
  {
619
+ "epoch": 6.287620459599704,
620
+ "grad_norm": 1.0758005380630493,
621
+ "learning_rate": 6.395555555555556e-06,
622
+ "loss": 0.0176,
623
  "step": 2125
624
  },
625
  {
626
+ "epoch": 6.3617494440326166,
627
+ "grad_norm": 1.6794716119766235,
628
+ "learning_rate": 6.34e-06,
629
+ "loss": 0.0166,
630
  "step": 2150
631
  },
632
  {
633
+ "epoch": 6.43587842846553,
634
+ "grad_norm": 0.4483010768890381,
635
+ "learning_rate": 6.284444444444445e-06,
636
+ "loss": 0.0116,
637
  "step": 2175
638
  },
639
  {
640
+ "epoch": 6.510007412898443,
641
+ "grad_norm": 0.6224784255027771,
642
+ "learning_rate": 6.22888888888889e-06,
643
+ "loss": 0.0122,
644
  "step": 2200
645
  },
646
  {
647
+ "epoch": 6.584136397331356,
648
+ "grad_norm": 0.5704225897789001,
649
+ "learning_rate": 6.173333333333333e-06,
650
+ "loss": 0.0138,
651
  "step": 2225
652
  },
653
  {
654
+ "epoch": 6.65826538176427,
655
+ "grad_norm": 0.976425051689148,
656
+ "learning_rate": 6.117777777777778e-06,
657
+ "loss": 0.0142,
658
  "step": 2250
659
  },
660
  {
661
+ "epoch": 6.732394366197183,
662
+ "grad_norm": 1.829641580581665,
663
+ "learning_rate": 6.062222222222223e-06,
664
+ "loss": 0.0139,
665
  "step": 2275
666
  },
667
  {
668
+ "epoch": 6.806523350630096,
669
+ "grad_norm": 0.645282506942749,
670
+ "learning_rate": 6.006666666666667e-06,
671
+ "loss": 0.0154,
672
  "step": 2300
673
  },
674
  {
675
+ "epoch": 6.88065233506301,
676
+ "grad_norm": 0.9580066204071045,
677
+ "learning_rate": 5.951111111111112e-06,
678
+ "loss": 0.0119,
679
  "step": 2325
680
  },
681
  {
682
+ "epoch": 6.954781319495923,
683
+ "grad_norm": 1.3833215236663818,
684
+ "learning_rate": 5.895555555555557e-06,
685
+ "loss": 0.0163,
686
  "step": 2350
687
  },
688
  {
689
+ "epoch": 7.026686434395849,
690
+ "grad_norm": 0.3075716495513916,
691
+ "learning_rate": 5.84e-06,
692
+ "loss": 0.0115,
693
  "step": 2375
694
  },
695
  {
696
+ "epoch": 7.100815418828762,
697
+ "grad_norm": 1.460079312324524,
698
+ "learning_rate": 5.784444444444445e-06,
699
+ "loss": 0.0098,
700
  "step": 2400
701
  },
702
  {
703
+ "epoch": 7.174944403261676,
704
+ "grad_norm": 0.24449419975280762,
705
+ "learning_rate": 5.72888888888889e-06,
706
+ "loss": 0.0082,
707
  "step": 2425
708
  },
709
  {
710
+ "epoch": 7.2490733876945885,
711
+ "grad_norm": 0.4475979208946228,
712
+ "learning_rate": 5.673333333333334e-06,
713
+ "loss": 0.008,
714
  "step": 2450
715
  },
716
  {
717
+ "epoch": 7.3232023721275015,
718
+ "grad_norm": 0.9485690593719482,
719
+ "learning_rate": 5.617777777777779e-06,
720
+ "loss": 0.0086,
721
  "step": 2475
722
  },
723
  {
724
+ "epoch": 7.397331356560415,
725
+ "grad_norm": 0.4818005859851837,
726
+ "learning_rate": 5.562222222222222e-06,
727
+ "loss": 0.0079,
728
  "step": 2500
729
  },
730
  {
731
+ "epoch": 7.471460340993328,
732
+ "grad_norm": 0.2728360891342163,
733
+ "learning_rate": 5.506666666666667e-06,
734
+ "loss": 0.0071,
735
  "step": 2525
736
  },
737
  {
738
+ "epoch": 7.545589325426242,
739
+ "grad_norm": 0.36738815903663635,
740
+ "learning_rate": 5.451111111111112e-06,
741
+ "loss": 0.0071,
742
  "step": 2550
743
  },
744
  {
745
+ "epoch": 7.619718309859155,
746
+ "grad_norm": 0.5995059609413147,
747
+ "learning_rate": 5.3955555555555565e-06,
748
+ "loss": 0.0071,
749
  "step": 2575
750
  },
751
  {
752
+ "epoch": 7.693847294292068,
753
+ "grad_norm": 0.6667615175247192,
754
+ "learning_rate": 5.3400000000000005e-06,
755
+ "loss": 0.0071,
756
  "step": 2600
757
  },
758
  {
759
+ "epoch": 7.767976278724982,
760
+ "grad_norm": 0.9709383845329285,
761
+ "learning_rate": 5.2844444444444454e-06,
762
+ "loss": 0.0077,
763
  "step": 2625
764
  },
765
  {
766
+ "epoch": 7.842105263157895,
767
+ "grad_norm": 0.9174025058746338,
768
+ "learning_rate": 5.228888888888889e-06,
769
+ "loss": 0.0064,
770
  "step": 2650
771
  },
772
  {
773
+ "epoch": 7.916234247590808,
774
+ "grad_norm": 0.4343569576740265,
775
+ "learning_rate": 5.1733333333333335e-06,
776
+ "loss": 0.006,
777
  "step": 2675
778
  },
779
  {
780
+ "epoch": 7.9903632320237215,
781
+ "grad_norm": 1.9002019166946411,
782
+ "learning_rate": 5.117777777777778e-06,
783
+ "loss": 0.0071,
784
  "step": 2700
785
  },
786
  {
787
+ "epoch": 8.062268346923647,
788
+ "grad_norm": 0.3372269570827484,
789
+ "learning_rate": 5.062222222222222e-06,
790
+ "loss": 0.005,
791
  "step": 2725
792
  },
793
  {
794
+ "epoch": 8.13639733135656,
795
+ "grad_norm": 0.8460608720779419,
796
+ "learning_rate": 5.006666666666667e-06,
797
+ "loss": 0.005,
798
  "step": 2750
799
  },
800
  {
801
+ "epoch": 8.210526315789474,
802
+ "grad_norm": 0.24673226475715637,
803
+ "learning_rate": 4.951111111111111e-06,
804
+ "loss": 0.0066,
805
  "step": 2775
806
  },
807
  {
808
+ "epoch": 8.284655300222386,
809
+ "grad_norm": 0.20106859505176544,
810
+ "learning_rate": 4.895555555555556e-06,
811
+ "loss": 0.0037,
812
  "step": 2800
813
  },
814
  {
815
+ "epoch": 8.3587842846553,
816
+ "grad_norm": 0.1510283499956131,
817
+ "learning_rate": 4.84e-06,
818
+ "loss": 0.0044,
819
  "step": 2825
820
  },
821
  {
822
+ "epoch": 8.432913269088214,
823
+ "grad_norm": 0.7809015512466431,
824
+ "learning_rate": 4.784444444444445e-06,
825
+ "loss": 0.0041,
826
  "step": 2850
827
  },
828
  {
829
+ "epoch": 8.507042253521126,
830
+ "grad_norm": 0.3642624318599701,
831
+ "learning_rate": 4.728888888888889e-06,
832
+ "loss": 0.0048,
833
  "step": 2875
834
  },
835
  {
836
+ "epoch": 8.58117123795404,
837
+ "grad_norm": 0.20495444536209106,
838
+ "learning_rate": 4.673333333333333e-06,
839
+ "loss": 0.0036,
840
  "step": 2900
841
  },
842
  {
843
+ "epoch": 8.655300222386954,
844
+ "grad_norm": 0.14625044167041779,
845
+ "learning_rate": 4.617777777777778e-06,
846
+ "loss": 0.0041,
847
  "step": 2925
848
  },
849
  {
850
+ "epoch": 8.729429206819866,
851
+ "grad_norm": 0.1600050926208496,
852
+ "learning_rate": 4.562222222222222e-06,
853
+ "loss": 0.0044,
854
  "step": 2950
855
  },
856
  {
857
+ "epoch": 8.80355819125278,
858
+ "grad_norm": 1.0638575553894043,
859
+ "learning_rate": 4.506666666666667e-06,
860
+ "loss": 0.0036,
861
  "step": 2975
862
  },
863
  {
864
+ "epoch": 8.877687175685693,
865
+ "grad_norm": 0.16608619689941406,
866
+ "learning_rate": 4.451111111111112e-06,
867
+ "loss": 0.0037,
868
  "step": 3000
869
  },
870
  {
871
+ "epoch": 8.877687175685693,
872
+ "eval_loss": 0.37816545367240906,
873
+ "eval_runtime": 954.3156,
874
+ "eval_samples_per_second": 3.023,
875
+ "eval_steps_per_second": 0.378,
876
+ "eval_wer": 0.17080007520467635,
877
  "step": 3000
878
  },
879
  {
880
+ "epoch": 8.951816160118607,
881
+ "grad_norm": 0.13698986172676086,
882
+ "learning_rate": 4.395555555555556e-06,
883
+ "loss": 0.0043,
884
+ "step": 3025
885
+ },
886
+ {
887
+ "epoch": 9.023721275018532,
888
+ "grad_norm": 0.15848596394062042,
889
+ "learning_rate": 4.34e-06,
890
+ "loss": 0.0044,
891
+ "step": 3050
892
+ },
893
+ {
894
+ "epoch": 9.097850259451446,
895
+ "grad_norm": 0.12134752422571182,
896
+ "learning_rate": 4.284444444444445e-06,
897
+ "loss": 0.0031,
898
+ "step": 3075
899
+ },
900
+ {
901
+ "epoch": 9.171979243884358,
902
+ "grad_norm": 0.13019242882728577,
903
+ "learning_rate": 4.228888888888889e-06,
904
+ "loss": 0.0027,
905
+ "step": 3100
906
+ },
907
+ {
908
+ "epoch": 9.246108228317272,
909
+ "grad_norm": 0.19179771840572357,
910
+ "learning_rate": 4.173333333333334e-06,
911
+ "loss": 0.0028,
912
+ "step": 3125
913
+ },
914
+ {
915
+ "epoch": 9.320237212750186,
916
+ "grad_norm": 0.20711077749729156,
917
+ "learning_rate": 4.117777777777779e-06,
918
+ "loss": 0.0027,
919
+ "step": 3150
920
+ },
921
+ {
922
+ "epoch": 9.394366197183098,
923
+ "grad_norm": 0.09309308230876923,
924
+ "learning_rate": 4.062222222222223e-06,
925
+ "loss": 0.0026,
926
+ "step": 3175
927
+ },
928
+ {
929
+ "epoch": 9.468495181616012,
930
+ "grad_norm": 0.09770819544792175,
931
+ "learning_rate": 4.006666666666667e-06,
932
+ "loss": 0.0035,
933
+ "step": 3200
934
+ },
935
+ {
936
+ "epoch": 9.542624166048926,
937
+ "grad_norm": 0.1151503324508667,
938
+ "learning_rate": 3.951111111111112e-06,
939
+ "loss": 0.0027,
940
+ "step": 3225
941
+ },
942
+ {
943
+ "epoch": 9.616753150481838,
944
+ "grad_norm": 0.1251019835472107,
945
+ "learning_rate": 3.895555555555556e-06,
946
+ "loss": 0.0029,
947
+ "step": 3250
948
+ },
949
+ {
950
+ "epoch": 9.690882134914752,
951
+ "grad_norm": 0.10979008674621582,
952
+ "learning_rate": 3.8400000000000005e-06,
953
+ "loss": 0.0027,
954
+ "step": 3275
955
+ },
956
+ {
957
+ "epoch": 9.765011119347665,
958
+ "grad_norm": 0.12528590857982635,
959
+ "learning_rate": 3.784444444444445e-06,
960
+ "loss": 0.0037,
961
+ "step": 3300
962
+ },
963
+ {
964
+ "epoch": 9.839140103780577,
965
+ "grad_norm": 0.12703803181648254,
966
+ "learning_rate": 3.728888888888889e-06,
967
+ "loss": 0.0026,
968
+ "step": 3325
969
+ },
970
+ {
971
+ "epoch": 9.913269088213491,
972
+ "grad_norm": 0.10044371336698532,
973
+ "learning_rate": 3.673333333333334e-06,
974
+ "loss": 0.003,
975
+ "step": 3350
976
+ },
977
+ {
978
+ "epoch": 9.987398072646405,
979
+ "grad_norm": 0.3393096625804901,
980
+ "learning_rate": 3.617777777777778e-06,
981
+ "loss": 0.0032,
982
+ "step": 3375
983
+ },
984
+ {
985
+ "epoch": 10.05930318754633,
986
+ "grad_norm": 0.08865969628095627,
987
+ "learning_rate": 3.5622222222222224e-06,
988
+ "loss": 0.0034,
989
+ "step": 3400
990
+ },
991
+ {
992
+ "epoch": 10.133432171979244,
993
+ "grad_norm": 0.07572653144598007,
994
+ "learning_rate": 3.5066666666666673e-06,
995
+ "loss": 0.0023,
996
+ "step": 3425
997
+ },
998
+ {
999
+ "epoch": 10.207561156412158,
1000
+ "grad_norm": 0.08611512184143066,
1001
+ "learning_rate": 3.4511111111111113e-06,
1002
+ "loss": 0.0021,
1003
+ "step": 3450
1004
+ },
1005
+ {
1006
+ "epoch": 10.28169014084507,
1007
+ "grad_norm": 0.09255647659301758,
1008
+ "learning_rate": 3.3955555555555558e-06,
1009
+ "loss": 0.0021,
1010
+ "step": 3475
1011
+ },
1012
+ {
1013
+ "epoch": 10.355819125277984,
1014
+ "grad_norm": 0.09140720963478088,
1015
+ "learning_rate": 3.3400000000000006e-06,
1016
+ "loss": 0.0021,
1017
+ "step": 3500
1018
+ },
1019
+ {
1020
+ "epoch": 10.429948109710898,
1021
+ "grad_norm": 0.08117499947547913,
1022
+ "learning_rate": 3.2844444444444447e-06,
1023
+ "loss": 0.002,
1024
+ "step": 3525
1025
+ },
1026
+ {
1027
+ "epoch": 10.50407709414381,
1028
+ "grad_norm": 0.07924593240022659,
1029
+ "learning_rate": 3.228888888888889e-06,
1030
+ "loss": 0.0021,
1031
+ "step": 3550
1032
+ },
1033
+ {
1034
+ "epoch": 10.578206078576724,
1035
+ "grad_norm": 0.09375474601984024,
1036
+ "learning_rate": 3.173333333333334e-06,
1037
+ "loss": 0.0023,
1038
+ "step": 3575
1039
+ },
1040
+ {
1041
+ "epoch": 10.652335063009637,
1042
+ "grad_norm": 0.08366698771715164,
1043
+ "learning_rate": 3.117777777777778e-06,
1044
+ "loss": 0.0022,
1045
+ "step": 3600
1046
+ },
1047
+ {
1048
+ "epoch": 10.72646404744255,
1049
+ "grad_norm": 0.09042704850435257,
1050
+ "learning_rate": 3.0622222222222225e-06,
1051
+ "loss": 0.0023,
1052
+ "step": 3625
1053
+ },
1054
+ {
1055
+ "epoch": 10.800593031875463,
1056
+ "grad_norm": 0.07148691266775131,
1057
+ "learning_rate": 3.0066666666666674e-06,
1058
+ "loss": 0.002,
1059
+ "step": 3650
1060
+ },
1061
+ {
1062
+ "epoch": 10.874722016308377,
1063
+ "grad_norm": 0.3031787574291229,
1064
+ "learning_rate": 2.9511111111111114e-06,
1065
+ "loss": 0.0022,
1066
+ "step": 3675
1067
+ },
1068
+ {
1069
+ "epoch": 10.94885100074129,
1070
+ "grad_norm": 0.09258675575256348,
1071
+ "learning_rate": 2.895555555555556e-06,
1072
+ "loss": 0.002,
1073
+ "step": 3700
1074
+ },
1075
+ {
1076
+ "epoch": 11.020756115641216,
1077
+ "grad_norm": 0.06253425031900406,
1078
+ "learning_rate": 2.84e-06,
1079
+ "loss": 0.002,
1080
+ "step": 3725
1081
+ },
1082
+ {
1083
+ "epoch": 11.094885100074128,
1084
+ "grad_norm": 0.057923316955566406,
1085
+ "learning_rate": 2.784444444444445e-06,
1086
+ "loss": 0.0017,
1087
+ "step": 3750
1088
+ },
1089
+ {
1090
+ "epoch": 11.169014084507042,
1091
+ "grad_norm": 0.06114868074655533,
1092
+ "learning_rate": 2.7288888888888893e-06,
1093
+ "loss": 0.0018,
1094
+ "step": 3775
1095
+ },
1096
+ {
1097
+ "epoch": 11.243143068939956,
1098
+ "grad_norm": 0.061181288212537766,
1099
+ "learning_rate": 2.6733333333333333e-06,
1100
+ "loss": 0.0017,
1101
+ "step": 3800
1102
+ },
1103
+ {
1104
+ "epoch": 11.317272053372868,
1105
+ "grad_norm": 0.06324835866689682,
1106
+ "learning_rate": 2.617777777777778e-06,
1107
+ "loss": 0.0018,
1108
+ "step": 3825
1109
+ },
1110
+ {
1111
+ "epoch": 11.391401037805782,
1112
+ "grad_norm": 0.07077573984861374,
1113
+ "learning_rate": 2.5622222222222226e-06,
1114
+ "loss": 0.0019,
1115
+ "step": 3850
1116
+ },
1117
+ {
1118
+ "epoch": 11.465530022238696,
1119
+ "grad_norm": 0.065274178981781,
1120
+ "learning_rate": 2.5066666666666667e-06,
1121
+ "loss": 0.0019,
1122
+ "step": 3875
1123
+ },
1124
+ {
1125
+ "epoch": 11.53965900667161,
1126
+ "grad_norm": 0.08122989535331726,
1127
+ "learning_rate": 2.451111111111111e-06,
1128
+ "loss": 0.0019,
1129
+ "step": 3900
1130
+ },
1131
+ {
1132
+ "epoch": 11.613787991104521,
1133
+ "grad_norm": 0.06203175708651543,
1134
+ "learning_rate": 2.3955555555555556e-06,
1135
+ "loss": 0.0018,
1136
+ "step": 3925
1137
+ },
1138
+ {
1139
+ "epoch": 11.687916975537435,
1140
+ "grad_norm": 0.08688808977603912,
1141
+ "learning_rate": 2.3400000000000005e-06,
1142
+ "loss": 0.0018,
1143
+ "step": 3950
1144
+ },
1145
+ {
1146
+ "epoch": 11.762045959970349,
1147
+ "grad_norm": 0.06490565836429596,
1148
+ "learning_rate": 2.2844444444444445e-06,
1149
+ "loss": 0.0017,
1150
+ "step": 3975
1151
+ },
1152
+ {
1153
+ "epoch": 11.836174944403261,
1154
+ "grad_norm": 0.06775514036417007,
1155
+ "learning_rate": 2.228888888888889e-06,
1156
+ "loss": 0.0018,
1157
+ "step": 4000
1158
+ },
1159
+ {
1160
+ "epoch": 11.836174944403261,
1161
+ "eval_loss": 0.39641496539115906,
1162
+ "eval_runtime": 944.1469,
1163
+ "eval_samples_per_second": 3.056,
1164
+ "eval_steps_per_second": 0.382,
1165
+ "eval_wer": 0.16803117575674706,
1166
+ "step": 4000
1167
+ },
1168
+ {
1169
+ "epoch": 11.910303928836175,
1170
+ "grad_norm": 0.06309942156076431,
1171
+ "learning_rate": 2.1733333333333334e-06,
1172
+ "loss": 0.0017,
1173
+ "step": 4025
1174
+ },
1175
+ {
1176
+ "epoch": 11.984432913269089,
1177
+ "grad_norm": 0.061430562287569046,
1178
+ "learning_rate": 2.117777777777778e-06,
1179
+ "loss": 0.0017,
1180
+ "step": 4050
1181
+ },
1182
+ {
1183
+ "epoch": 12.056338028169014,
1184
+ "grad_norm": 0.05334235355257988,
1185
+ "learning_rate": 2.0622222222222223e-06,
1186
+ "loss": 0.0016,
1187
+ "step": 4075
1188
+ },
1189
+ {
1190
+ "epoch": 12.130467012601928,
1191
+ "grad_norm": 0.057459134608507156,
1192
+ "learning_rate": 2.006666666666667e-06,
1193
+ "loss": 0.0016,
1194
+ "step": 4100
1195
+ },
1196
+ {
1197
+ "epoch": 12.20459599703484,
1198
+ "grad_norm": 0.047365590929985046,
1199
+ "learning_rate": 1.9511111111111113e-06,
1200
+ "loss": 0.0017,
1201
+ "step": 4125
1202
+ },
1203
+ {
1204
+ "epoch": 12.278724981467754,
1205
+ "grad_norm": 0.056059811264276505,
1206
+ "learning_rate": 1.8955555555555557e-06,
1207
+ "loss": 0.0015,
1208
+ "step": 4150
1209
+ },
1210
+ {
1211
+ "epoch": 12.352853965900668,
1212
+ "grad_norm": 0.062216296792030334,
1213
+ "learning_rate": 1.8400000000000002e-06,
1214
+ "loss": 0.0016,
1215
+ "step": 4175
1216
+ },
1217
+ {
1218
+ "epoch": 12.42698295033358,
1219
+ "grad_norm": 0.055066876113414764,
1220
+ "learning_rate": 1.7844444444444444e-06,
1221
+ "loss": 0.0015,
1222
+ "step": 4200
1223
+ },
1224
+ {
1225
+ "epoch": 12.501111934766493,
1226
+ "grad_norm": 0.06531412899494171,
1227
+ "learning_rate": 1.728888888888889e-06,
1228
+ "loss": 0.0015,
1229
+ "step": 4225
1230
+ },
1231
+ {
1232
+ "epoch": 12.575240919199407,
1233
+ "grad_norm": 0.04836783558130264,
1234
+ "learning_rate": 1.6733333333333335e-06,
1235
+ "loss": 0.0016,
1236
+ "step": 4250
1237
+ },
1238
+ {
1239
+ "epoch": 12.649369903632321,
1240
+ "grad_norm": 0.050022318959236145,
1241
+ "learning_rate": 1.6177777777777778e-06,
1242
+ "loss": 0.0016,
1243
+ "step": 4275
1244
+ },
1245
+ {
1246
+ "epoch": 12.723498888065233,
1247
+ "grad_norm": 0.05791894719004631,
1248
+ "learning_rate": 1.5622222222222225e-06,
1249
+ "loss": 0.0015,
1250
+ "step": 4300
1251
+ },
1252
+ {
1253
+ "epoch": 12.797627872498147,
1254
+ "grad_norm": 0.050857629626989365,
1255
+ "learning_rate": 1.506666666666667e-06,
1256
+ "loss": 0.0016,
1257
+ "step": 4325
1258
+ },
1259
+ {
1260
+ "epoch": 12.87175685693106,
1261
+ "grad_norm": 0.05395572632551193,
1262
+ "learning_rate": 1.4511111111111112e-06,
1263
+ "loss": 0.0016,
1264
+ "step": 4350
1265
+ },
1266
+ {
1267
+ "epoch": 12.945885841363973,
1268
+ "grad_norm": 0.06613746285438538,
1269
+ "learning_rate": 1.3955555555555556e-06,
1270
+ "loss": 0.0015,
1271
+ "step": 4375
1272
+ },
1273
+ {
1274
+ "epoch": 13.0177909562639,
1275
+ "grad_norm": 0.048592809587717056,
1276
+ "learning_rate": 1.34e-06,
1277
+ "loss": 0.0014,
1278
+ "step": 4400
1279
+ },
1280
+ {
1281
+ "epoch": 13.091919940696812,
1282
+ "grad_norm": 0.055479902774095535,
1283
+ "learning_rate": 1.2844444444444445e-06,
1284
+ "loss": 0.0014,
1285
+ "step": 4425
1286
+ },
1287
+ {
1288
+ "epoch": 13.166048925129726,
1289
+ "grad_norm": 0.045032110065221786,
1290
+ "learning_rate": 1.228888888888889e-06,
1291
+ "loss": 0.0014,
1292
+ "step": 4450
1293
+ },
1294
+ {
1295
+ "epoch": 13.24017790956264,
1296
+ "grad_norm": 0.06555484235286713,
1297
+ "learning_rate": 1.1733333333333335e-06,
1298
+ "loss": 0.0014,
1299
+ "step": 4475
1300
+ },
1301
+ {
1302
+ "epoch": 13.314306893995552,
1303
+ "grad_norm": 0.048962488770484924,
1304
+ "learning_rate": 1.117777777777778e-06,
1305
+ "loss": 0.0014,
1306
+ "step": 4500
1307
+ },
1308
+ {
1309
+ "epoch": 13.388435878428465,
1310
+ "grad_norm": 0.047078557312488556,
1311
+ "learning_rate": 1.0622222222222222e-06,
1312
+ "loss": 0.0014,
1313
+ "step": 4525
1314
+ },
1315
+ {
1316
+ "epoch": 13.46256486286138,
1317
+ "grad_norm": 0.050719983875751495,
1318
+ "learning_rate": 1.0066666666666668e-06,
1319
+ "loss": 0.0014,
1320
+ "step": 4550
1321
+ },
1322
+ {
1323
+ "epoch": 13.536693847294291,
1324
+ "grad_norm": 0.060897134244441986,
1325
+ "learning_rate": 9.511111111111111e-07,
1326
+ "loss": 0.0014,
1327
+ "step": 4575
1328
+ },
1329
+ {
1330
+ "epoch": 13.610822831727205,
1331
+ "grad_norm": 0.0608171783387661,
1332
+ "learning_rate": 8.955555555555557e-07,
1333
+ "loss": 0.0016,
1334
+ "step": 4600
1335
+ },
1336
+ {
1337
+ "epoch": 13.684951816160119,
1338
+ "grad_norm": 0.050377678126096725,
1339
+ "learning_rate": 8.400000000000001e-07,
1340
+ "loss": 0.0014,
1341
+ "step": 4625
1342
+ },
1343
+ {
1344
+ "epoch": 13.759080800593033,
1345
+ "grad_norm": 0.04982956126332283,
1346
+ "learning_rate": 7.844444444444445e-07,
1347
+ "loss": 0.0013,
1348
+ "step": 4650
1349
+ },
1350
+ {
1351
+ "epoch": 13.833209785025945,
1352
+ "grad_norm": 0.049559298902750015,
1353
+ "learning_rate": 7.28888888888889e-07,
1354
+ "loss": 0.0014,
1355
+ "step": 4675
1356
+ },
1357
+ {
1358
+ "epoch": 13.907338769458859,
1359
+ "grad_norm": 0.045101020485162735,
1360
+ "learning_rate": 6.733333333333334e-07,
1361
+ "loss": 0.0014,
1362
+ "step": 4700
1363
+ },
1364
+ {
1365
+ "epoch": 13.981467753891772,
1366
+ "grad_norm": 0.04930800572037697,
1367
+ "learning_rate": 6.177777777777778e-07,
1368
+ "loss": 0.0014,
1369
+ "step": 4725
1370
+ },
1371
+ {
1372
+ "epoch": 14.053372868791698,
1373
+ "grad_norm": 0.046362608671188354,
1374
+ "learning_rate": 5.622222222222223e-07,
1375
+ "loss": 0.0013,
1376
+ "step": 4750
1377
+ },
1378
+ {
1379
+ "epoch": 14.127501853224611,
1380
+ "grad_norm": 0.05173536762595177,
1381
+ "learning_rate": 5.066666666666667e-07,
1382
+ "loss": 0.0015,
1383
+ "step": 4775
1384
+ },
1385
+ {
1386
+ "epoch": 14.201630837657524,
1387
+ "grad_norm": 0.045450981706380844,
1388
+ "learning_rate": 4.511111111111111e-07,
1389
+ "loss": 0.0013,
1390
+ "step": 4800
1391
+ },
1392
+ {
1393
+ "epoch": 14.275759822090437,
1394
+ "grad_norm": 0.04437384009361267,
1395
+ "learning_rate": 3.9555555555555557e-07,
1396
+ "loss": 0.0013,
1397
+ "step": 4825
1398
+ },
1399
+ {
1400
+ "epoch": 14.349888806523351,
1401
+ "grad_norm": 0.048076264560222626,
1402
+ "learning_rate": 3.4000000000000003e-07,
1403
+ "loss": 0.0013,
1404
+ "step": 4850
1405
+ },
1406
+ {
1407
+ "epoch": 14.424017790956263,
1408
+ "grad_norm": 0.047441426664590836,
1409
+ "learning_rate": 2.844444444444445e-07,
1410
+ "loss": 0.0014,
1411
+ "step": 4875
1412
+ },
1413
+ {
1414
+ "epoch": 14.498146775389177,
1415
+ "grad_norm": 0.04609740898013115,
1416
+ "learning_rate": 2.2888888888888892e-07,
1417
+ "loss": 0.0013,
1418
+ "step": 4900
1419
+ },
1420
+ {
1421
+ "epoch": 14.572275759822091,
1422
+ "grad_norm": 0.05346500128507614,
1423
+ "learning_rate": 1.7333333333333335e-07,
1424
+ "loss": 0.0013,
1425
+ "step": 4925
1426
+ },
1427
+ {
1428
+ "epoch": 14.646404744255003,
1429
+ "grad_norm": 0.04578370600938797,
1430
+ "learning_rate": 1.1777777777777778e-07,
1431
+ "loss": 0.0013,
1432
+ "step": 4950
1433
+ },
1434
+ {
1435
+ "epoch": 14.720533728687917,
1436
+ "grad_norm": 0.045380473136901855,
1437
+ "learning_rate": 6.222222222222223e-08,
1438
+ "loss": 0.0014,
1439
+ "step": 4975
1440
+ },
1441
+ {
1442
+ "epoch": 14.79466271312083,
1443
+ "grad_norm": 0.04897478222846985,
1444
+ "learning_rate": 6.666666666666667e-09,
1445
+ "loss": 0.0013,
1446
+ "step": 5000
1447
+ },
1448
+ {
1449
+ "epoch": 14.79466271312083,
1450
+ "eval_loss": 0.4044080376625061,
1451
+ "eval_runtime": 939.8085,
1452
+ "eval_samples_per_second": 3.07,
1453
+ "eval_steps_per_second": 0.384,
1454
+ "eval_wer": 0.1675184165997231,
1455
+ "step": 5000
1456
+ },
1457
+ {
1458
+ "epoch": 14.79466271312083,
1459
+ "step": 5000,
1460
+ "total_flos": 4.607669973179538e+19,
1461
+ "train_loss": 0.11535389684215187,
1462
+ "train_runtime": 25285.0125,
1463
+ "train_samples_per_second": 6.328,
1464
+ "train_steps_per_second": 0.198
1465
  }
1466
  ],
1467
  "logging_steps": 25,
1468
+ "max_steps": 5000,
1469
  "num_input_tokens_seen": 0,
1470
+ "num_train_epochs": 15,
1471
  "save_steps": 1000,
1472
  "stateful_callbacks": {
1473
  "TrainerControl": {
 
1481
  "attributes": {}
1482
  }
1483
  },
1484
+ "total_flos": 4.607669973179538e+19,
1485
  "train_batch_size": 4,
1486
  "trial_name": null,
1487
  "trial_params": null