rbcurzon commited on
Commit
8e56c9c
·
verified ·
1 Parent(s): 76194b9

End of training

Browse files
README.md CHANGED
@@ -4,11 +4,24 @@ license: apache-2.0
4
  base_model: openai/whisper-medium
5
  tags:
6
  - generated_from_trainer
 
 
7
  metrics:
8
  - wer
9
  model-index:
10
  - name: whisper-medium-ph
11
- results: []
 
 
 
 
 
 
 
 
 
 
 
12
  ---
13
 
14
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -16,7 +29,7 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # whisper-medium-ph
18
 
19
- This model is a fine-tuned version of [openai/whisper-medium](https://huggingface.co/openai/whisper-medium) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 0.2901
22
  - Wer: 0.1147
 
4
  base_model: openai/whisper-medium
5
  tags:
6
  - generated_from_trainer
7
+ datasets:
8
+ - rbcurzon/ph_dialect_asr
9
  metrics:
10
  - wer
11
  model-index:
12
  - name: whisper-medium-ph
13
+ results:
14
+ - task:
15
+ name: Automatic Speech Recognition
16
+ type: automatic-speech-recognition
17
+ dataset:
18
+ name: rbcurzon/ph_dialect_asr all
19
+ type: rbcurzon/ph_dialect_asr
20
+ args: all
21
+ metrics:
22
+ - name: Wer
23
+ type: wer
24
+ value: 0.1146545827633379
25
  ---
26
 
27
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
29
 
30
  # whisper-medium-ph
31
 
32
+ This model is a fine-tuned version of [openai/whisper-medium](https://huggingface.co/openai/whisper-medium) on the rbcurzon/ph_dialect_asr all dataset.
33
  It achieves the following results on the evaluation set:
34
  - Loss: 0.2901
35
  - Wer: 0.1147
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 2.466091245376079,
3
- "eval_loss": 0.3112793266773224,
4
- "eval_runtime": 1173.9261,
5
- "eval_samples": 3612,
6
- "eval_samples_per_second": 3.077,
7
- "eval_steps_per_second": 0.193,
8
- "eval_wer": 0.12829864835872132,
9
- "total_flos": 3.265323341119488e+19,
10
- "train_loss": 0.2021937195956707,
11
- "train_runtime": 9527.7148,
12
- "train_samples": 12973,
13
- "train_samples_per_second": 3.359,
14
- "train_steps_per_second": 0.21
15
  }
 
1
  {
2
+ "epoch": 7.407709414381023,
3
+ "eval_loss": 0.29011788964271545,
4
+ "eval_runtime": 736.3871,
5
+ "eval_samples": 2885,
6
+ "eval_samples_per_second": 3.918,
7
+ "eval_steps_per_second": 0.49,
8
+ "eval_wer": 0.1146545827633379,
9
+ "total_flos": 8.155551755501568e+19,
10
+ "train_loss": 0.10907779041565954,
11
+ "train_runtime": 12394.4337,
12
+ "train_samples": 10787,
13
+ "train_samples_per_second": 6.455,
14
+ "train_steps_per_second": 0.403
15
  }
eval_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 2.466091245376079,
3
- "eval_loss": 0.3112793266773224,
4
- "eval_runtime": 1173.9261,
5
- "eval_samples": 3612,
6
- "eval_samples_per_second": 3.077,
7
- "eval_steps_per_second": 0.193,
8
- "eval_wer": 0.12829864835872132
9
  }
 
1
  {
2
+ "epoch": 7.407709414381023,
3
+ "eval_loss": 0.29011788964271545,
4
+ "eval_runtime": 736.3871,
5
+ "eval_samples": 2885,
6
+ "eval_samples_per_second": 3.918,
7
+ "eval_steps_per_second": 0.49,
8
+ "eval_wer": 0.1146545827633379
9
  }
runs/Aug09_05-15-33_30e2a19b1988/events.out.tfevents.1754729893.30e2a19b1988.3712.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fc589040fcd11263525855db0ac5e3ae941083ed659ce852614b4c40555680d
3
+ size 406
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 2.466091245376079,
3
- "total_flos": 3.265323341119488e+19,
4
- "train_loss": 0.2021937195956707,
5
- "train_runtime": 9527.7148,
6
- "train_samples": 12973,
7
- "train_samples_per_second": 3.359,
8
- "train_steps_per_second": 0.21
9
  }
 
1
  {
2
+ "epoch": 7.407709414381023,
3
+ "total_flos": 8.155551755501568e+19,
4
+ "train_loss": 0.10907779041565954,
5
+ "train_runtime": 12394.4337,
6
+ "train_samples": 10787,
7
+ "train_samples_per_second": 6.455,
8
+ "train_steps_per_second": 0.403
9
  }
trainer_state.json CHANGED
@@ -2,605 +2,1472 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.466091245376079,
6
  "eval_steps": 1000,
7
- "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.030826140567200986,
14
- "grad_norm": 11.499724388122559,
15
- "learning_rate": 4.4e-07,
16
- "loss": 1.2454,
17
  "step": 25
18
  },
19
  {
20
- "epoch": 0.06165228113440197,
21
- "grad_norm": 8.96716594696045,
22
- "learning_rate": 9.400000000000001e-07,
23
- "loss": 1.0189,
24
  "step": 50
25
  },
26
  {
27
- "epoch": 0.09247842170160296,
28
- "grad_norm": 7.870485305786133,
29
- "learning_rate": 1.44e-06,
30
- "loss": 0.8525,
31
  "step": 75
32
  },
33
  {
34
- "epoch": 0.12330456226880394,
35
- "grad_norm": 6.292770862579346,
36
- "learning_rate": 1.94e-06,
37
- "loss": 0.6987,
38
  "step": 100
39
  },
40
  {
41
- "epoch": 0.15413070283600494,
42
- "grad_norm": 6.656809329986572,
43
- "learning_rate": 2.4400000000000004e-06,
44
- "loss": 0.6285,
45
  "step": 125
46
  },
47
  {
48
- "epoch": 0.18495684340320592,
49
- "grad_norm": 6.315510272979736,
50
- "learning_rate": 2.9400000000000002e-06,
51
- "loss": 0.6055,
52
  "step": 150
53
  },
54
  {
55
- "epoch": 0.2157829839704069,
56
- "grad_norm": 7.333935737609863,
57
- "learning_rate": 3.44e-06,
58
- "loss": 0.5517,
59
  "step": 175
60
  },
61
  {
62
- "epoch": 0.2466091245376079,
63
- "grad_norm": 7.557521820068359,
64
- "learning_rate": 3.94e-06,
65
- "loss": 0.4966,
66
  "step": 200
67
  },
68
  {
69
- "epoch": 0.27743526510480887,
70
- "grad_norm": 5.930201053619385,
71
- "learning_rate": 4.440000000000001e-06,
72
- "loss": 0.4821,
73
  "step": 225
74
  },
75
  {
76
- "epoch": 0.3082614056720099,
77
- "grad_norm": 6.367243766784668,
78
- "learning_rate": 4.94e-06,
79
- "loss": 0.4661,
80
  "step": 250
81
  },
82
  {
83
- "epoch": 0.33908754623921084,
84
- "grad_norm": 6.376411437988281,
85
- "learning_rate": 5.4400000000000004e-06,
86
- "loss": 0.4417,
87
  "step": 275
88
  },
89
  {
90
- "epoch": 0.36991368680641185,
91
- "grad_norm": 5.1631574630737305,
92
- "learning_rate": 5.94e-06,
93
- "loss": 0.4297,
94
  "step": 300
95
  },
96
  {
97
- "epoch": 0.4007398273736128,
98
- "grad_norm": 5.0071330070495605,
99
- "learning_rate": 6.440000000000001e-06,
100
- "loss": 0.3914,
101
  "step": 325
102
  },
103
  {
104
- "epoch": 0.4315659679408138,
105
- "grad_norm": 4.759220123291016,
106
- "learning_rate": 6.9400000000000005e-06,
107
- "loss": 0.3548,
108
  "step": 350
109
  },
110
  {
111
- "epoch": 0.4623921085080148,
112
- "grad_norm": 5.040701389312744,
113
- "learning_rate": 7.440000000000001e-06,
114
- "loss": 0.3546,
115
  "step": 375
116
  },
117
  {
118
- "epoch": 0.4932182490752158,
119
- "grad_norm": 5.04915189743042,
120
- "learning_rate": 7.94e-06,
121
- "loss": 0.3388,
122
  "step": 400
123
  },
124
  {
125
- "epoch": 0.5240443896424167,
126
- "grad_norm": 5.2604546546936035,
127
- "learning_rate": 8.44e-06,
128
- "loss": 0.3048,
129
  "step": 425
130
  },
131
  {
132
- "epoch": 0.5548705302096177,
133
- "grad_norm": 5.409047603607178,
134
- "learning_rate": 8.94e-06,
135
- "loss": 0.3204,
136
  "step": 450
137
  },
138
  {
139
- "epoch": 0.5856966707768188,
140
- "grad_norm": 7.095681190490723,
141
- "learning_rate": 9.440000000000001e-06,
142
- "loss": 0.2962,
143
  "step": 475
144
  },
145
  {
146
- "epoch": 0.6165228113440198,
147
- "grad_norm": 5.518855571746826,
148
- "learning_rate": 9.940000000000001e-06,
149
- "loss": 0.2854,
150
  "step": 500
151
  },
152
  {
153
- "epoch": 0.6473489519112207,
154
- "grad_norm": 5.083781719207764,
155
- "learning_rate": 9.853333333333334e-06,
156
- "loss": 0.2816,
157
  "step": 525
158
  },
159
  {
160
- "epoch": 0.6781750924784217,
161
- "grad_norm": 4.599233150482178,
162
- "learning_rate": 9.686666666666668e-06,
163
- "loss": 0.2804,
164
  "step": 550
165
  },
166
  {
167
- "epoch": 0.7090012330456227,
168
- "grad_norm": 5.882887840270996,
169
- "learning_rate": 9.52e-06,
170
- "loss": 0.254,
171
  "step": 575
172
  },
173
  {
174
- "epoch": 0.7398273736128237,
175
- "grad_norm": 3.953178644180298,
176
- "learning_rate": 9.353333333333334e-06,
177
- "loss": 0.2522,
178
  "step": 600
179
  },
180
  {
181
- "epoch": 0.7706535141800247,
182
- "grad_norm": 3.6629250049591064,
183
- "learning_rate": 9.186666666666666e-06,
184
- "loss": 0.2217,
185
  "step": 625
186
  },
187
  {
188
- "epoch": 0.8014796547472256,
189
- "grad_norm": 5.168231010437012,
190
- "learning_rate": 9.020000000000002e-06,
191
- "loss": 0.2451,
192
  "step": 650
193
  },
194
  {
195
- "epoch": 0.8323057953144266,
196
- "grad_norm": 4.211165904998779,
197
- "learning_rate": 8.853333333333334e-06,
198
- "loss": 0.216,
199
  "step": 675
200
  },
201
  {
202
- "epoch": 0.8631319358816276,
203
- "grad_norm": 4.873836994171143,
204
- "learning_rate": 8.686666666666668e-06,
205
- "loss": 0.2027,
206
  "step": 700
207
  },
208
  {
209
- "epoch": 0.8939580764488286,
210
- "grad_norm": 4.061721324920654,
211
- "learning_rate": 8.52e-06,
212
- "loss": 0.2184,
213
  "step": 725
214
  },
215
  {
216
- "epoch": 0.9247842170160296,
217
- "grad_norm": 5.536831855773926,
218
- "learning_rate": 8.353333333333335e-06,
219
- "loss": 0.2212,
220
  "step": 750
221
  },
222
  {
223
- "epoch": 0.9556103575832305,
224
- "grad_norm": 3.8625235557556152,
225
- "learning_rate": 8.186666666666667e-06,
226
- "loss": 0.1874,
227
  "step": 775
228
  },
229
  {
230
- "epoch": 0.9864364981504316,
231
- "grad_norm": 4.030850887298584,
232
- "learning_rate": 8.020000000000001e-06,
233
- "loss": 0.1962,
234
  "step": 800
235
  },
236
  {
237
- "epoch": 1.0172626387176325,
238
- "grad_norm": 3.3085479736328125,
239
- "learning_rate": 7.853333333333333e-06,
240
- "loss": 0.1508,
241
  "step": 825
242
  },
243
  {
244
- "epoch": 1.0480887792848335,
245
- "grad_norm": 3.4279379844665527,
246
- "learning_rate": 7.686666666666667e-06,
247
- "loss": 0.0925,
248
  "step": 850
249
  },
250
  {
251
- "epoch": 1.0789149198520345,
252
- "grad_norm": 3.8404757976531982,
253
- "learning_rate": 7.520000000000001e-06,
254
- "loss": 0.1182,
255
  "step": 875
256
  },
257
  {
258
- "epoch": 1.1097410604192355,
259
- "grad_norm": 3.7630670070648193,
260
- "learning_rate": 7.353333333333334e-06,
261
- "loss": 0.1136,
262
  "step": 900
263
  },
264
  {
265
- "epoch": 1.1405672009864365,
266
- "grad_norm": 3.491826295852661,
267
- "learning_rate": 7.186666666666668e-06,
268
- "loss": 0.1069,
269
  "step": 925
270
  },
271
  {
272
- "epoch": 1.1713933415536375,
273
- "grad_norm": 2.2027463912963867,
274
- "learning_rate": 7.0200000000000006e-06,
275
- "loss": 0.1048,
276
  "step": 950
277
  },
278
  {
279
- "epoch": 1.2022194821208385,
280
- "grad_norm": 4.981961250305176,
281
- "learning_rate": 6.853333333333334e-06,
282
- "loss": 0.1141,
283
  "step": 975
284
  },
285
  {
286
- "epoch": 1.2330456226880395,
287
- "grad_norm": 3.4486851692199707,
288
- "learning_rate": 6.6866666666666665e-06,
289
- "loss": 0.1001,
290
  "step": 1000
291
  },
292
  {
293
- "epoch": 1.2330456226880395,
294
- "eval_loss": 0.30403050780296326,
295
- "eval_runtime": 1181.2571,
296
- "eval_samples_per_second": 3.058,
297
- "eval_steps_per_second": 0.191,
298
- "eval_wer": 0.14332880402393383,
299
  "step": 1000
300
  },
301
  {
302
- "epoch": 1.2638717632552403,
303
- "grad_norm": 2.620025396347046,
304
- "learning_rate": 6.520000000000001e-06,
305
- "loss": 0.0916,
306
  "step": 1025
307
  },
308
  {
309
- "epoch": 1.2946979038224415,
310
- "grad_norm": 3.629256248474121,
311
- "learning_rate": 6.353333333333333e-06,
312
- "loss": 0.0949,
313
  "step": 1050
314
  },
315
  {
316
- "epoch": 1.3255240443896423,
317
- "grad_norm": 2.832113265991211,
318
- "learning_rate": 6.186666666666668e-06,
319
- "loss": 0.084,
320
  "step": 1075
321
  },
322
  {
323
- "epoch": 1.3563501849568433,
324
- "grad_norm": 3.5080323219299316,
325
- "learning_rate": 6.02e-06,
326
- "loss": 0.0881,
327
  "step": 1100
328
  },
329
  {
330
- "epoch": 1.3871763255240444,
331
- "grad_norm": 3.9893321990966797,
332
- "learning_rate": 5.853333333333335e-06,
333
- "loss": 0.0781,
334
  "step": 1125
335
  },
336
  {
337
- "epoch": 1.4180024660912454,
338
- "grad_norm": 2.786031723022461,
339
- "learning_rate": 5.686666666666667e-06,
340
- "loss": 0.0852,
341
  "step": 1150
342
  },
343
  {
344
- "epoch": 1.4488286066584464,
345
- "grad_norm": 2.333205461502075,
346
- "learning_rate": 5.5200000000000005e-06,
347
- "loss": 0.0759,
348
  "step": 1175
349
  },
350
  {
351
- "epoch": 1.4796547472256474,
352
- "grad_norm": 3.204261302947998,
353
- "learning_rate": 5.3533333333333335e-06,
354
- "loss": 0.0865,
355
  "step": 1200
356
  },
357
  {
358
- "epoch": 1.5104808877928484,
359
- "grad_norm": 3.2963826656341553,
360
- "learning_rate": 5.186666666666667e-06,
361
- "loss": 0.0757,
362
  "step": 1225
363
  },
364
  {
365
- "epoch": 1.5413070283600492,
366
- "grad_norm": 2.5825254917144775,
367
- "learning_rate": 5.02e-06,
368
- "loss": 0.0717,
369
  "step": 1250
370
  },
371
  {
372
- "epoch": 1.5721331689272504,
373
- "grad_norm": 2.7192881107330322,
374
- "learning_rate": 4.853333333333334e-06,
375
- "loss": 0.0722,
376
  "step": 1275
377
  },
378
  {
379
- "epoch": 1.6029593094944512,
380
- "grad_norm": 2.911716938018799,
381
- "learning_rate": 4.686666666666667e-06,
382
- "loss": 0.0757,
383
  "step": 1300
384
  },
385
  {
386
- "epoch": 1.6337854500616524,
387
- "grad_norm": 2.1598548889160156,
388
- "learning_rate": 4.520000000000001e-06,
389
- "loss": 0.0704,
390
  "step": 1325
391
  },
392
  {
393
- "epoch": 1.6646115906288532,
394
- "grad_norm": 2.1722934246063232,
395
- "learning_rate": 4.353333333333334e-06,
396
- "loss": 0.0621,
397
  "step": 1350
398
  },
399
  {
400
- "epoch": 1.6954377311960542,
401
- "grad_norm": 2.0885589122772217,
402
- "learning_rate": 4.1866666666666675e-06,
403
- "loss": 0.0737,
404
  "step": 1375
405
  },
406
  {
407
- "epoch": 1.7262638717632552,
408
- "grad_norm": 3.2038285732269287,
409
- "learning_rate": 4.0200000000000005e-06,
410
- "loss": 0.0729,
411
  "step": 1400
412
  },
413
  {
414
- "epoch": 1.7570900123304563,
415
- "grad_norm": 2.2708075046539307,
416
- "learning_rate": 3.853333333333334e-06,
417
- "loss": 0.0571,
418
  "step": 1425
419
  },
420
  {
421
- "epoch": 1.7879161528976573,
422
- "grad_norm": 1.496151089668274,
423
- "learning_rate": 3.686666666666667e-06,
424
- "loss": 0.0532,
425
  "step": 1450
426
  },
427
  {
428
- "epoch": 1.818742293464858,
429
- "grad_norm": 1.8642607927322388,
430
- "learning_rate": 3.52e-06,
431
- "loss": 0.0544,
432
  "step": 1475
433
  },
434
  {
435
- "epoch": 1.8495684340320593,
436
- "grad_norm": 2.1302435398101807,
437
- "learning_rate": 3.3533333333333336e-06,
438
- "loss": 0.0425,
439
  "step": 1500
440
  },
441
  {
442
- "epoch": 1.88039457459926,
443
- "grad_norm": 1.2720330953598022,
444
- "learning_rate": 3.186666666666667e-06,
445
- "loss": 0.0486,
446
  "step": 1525
447
  },
448
  {
449
- "epoch": 1.9112207151664613,
450
- "grad_norm": 1.3626000881195068,
451
- "learning_rate": 3.0200000000000003e-06,
452
- "loss": 0.0482,
453
  "step": 1550
454
  },
455
  {
456
- "epoch": 1.942046855733662,
457
- "grad_norm": 2.652956485748291,
458
- "learning_rate": 2.8533333333333337e-06,
459
- "loss": 0.047,
460
  "step": 1575
461
  },
462
  {
463
- "epoch": 1.972872996300863,
464
- "grad_norm": 2.178326368331909,
465
- "learning_rate": 2.686666666666667e-06,
466
- "loss": 0.0543,
467
  "step": 1600
468
  },
469
  {
470
- "epoch": 2.003699136868064,
471
- "grad_norm": 0.6113713979721069,
472
- "learning_rate": 2.52e-06,
473
- "loss": 0.0416,
474
  "step": 1625
475
  },
476
  {
477
- "epoch": 2.034525277435265,
478
- "grad_norm": 1.6302359104156494,
479
- "learning_rate": 2.3533333333333334e-06,
480
- "loss": 0.0167,
481
  "step": 1650
482
  },
483
  {
484
- "epoch": 2.065351418002466,
485
- "grad_norm": 0.9459154605865479,
486
- "learning_rate": 2.1866666666666668e-06,
487
- "loss": 0.0137,
488
  "step": 1675
489
  },
490
  {
491
- "epoch": 2.096177558569667,
492
- "grad_norm": 1.4943691492080688,
493
- "learning_rate": 2.02e-06,
494
- "loss": 0.0159,
495
  "step": 1700
496
  },
497
  {
498
- "epoch": 2.127003699136868,
499
- "grad_norm": 0.5425832867622375,
500
- "learning_rate": 1.8533333333333333e-06,
501
- "loss": 0.0152,
502
  "step": 1725
503
  },
504
  {
505
- "epoch": 2.157829839704069,
506
- "grad_norm": 1.4946790933609009,
507
- "learning_rate": 1.6866666666666667e-06,
508
- "loss": 0.0146,
509
  "step": 1750
510
  },
511
  {
512
- "epoch": 2.18865598027127,
513
- "grad_norm": 0.9100169539451599,
514
- "learning_rate": 1.52e-06,
515
- "loss": 0.0191,
516
  "step": 1775
517
  },
518
  {
519
- "epoch": 2.219482120838471,
520
- "grad_norm": 1.2448313236236572,
521
- "learning_rate": 1.3533333333333334e-06,
522
- "loss": 0.0159,
523
  "step": 1800
524
  },
525
  {
526
- "epoch": 2.250308261405672,
527
- "grad_norm": 0.9976411461830139,
528
- "learning_rate": 1.1866666666666668e-06,
529
- "loss": 0.0203,
530
  "step": 1825
531
  },
532
  {
533
- "epoch": 2.281134401972873,
534
- "grad_norm": 1.358780860900879,
535
- "learning_rate": 1.02e-06,
536
- "loss": 0.0139,
537
  "step": 1850
538
  },
539
  {
540
- "epoch": 2.311960542540074,
541
- "grad_norm": 1.2800226211547852,
542
- "learning_rate": 8.533333333333334e-07,
543
- "loss": 0.0175,
544
  "step": 1875
545
  },
546
  {
547
- "epoch": 2.342786683107275,
548
- "grad_norm": 1.007161021232605,
549
- "learning_rate": 6.866666666666667e-07,
550
- "loss": 0.013,
551
  "step": 1900
552
  },
553
  {
554
- "epoch": 2.373612823674476,
555
- "grad_norm": 1.1838051080703735,
556
- "learning_rate": 5.2e-07,
557
- "loss": 0.0144,
558
  "step": 1925
559
  },
560
  {
561
- "epoch": 2.404438964241677,
562
- "grad_norm": 1.2872673273086548,
563
- "learning_rate": 3.533333333333334e-07,
564
- "loss": 0.0134,
565
  "step": 1950
566
  },
567
  {
568
- "epoch": 2.435265104808878,
569
- "grad_norm": 0.7094443440437317,
570
- "learning_rate": 1.866666666666667e-07,
571
- "loss": 0.0123,
572
  "step": 1975
573
  },
574
  {
575
- "epoch": 2.466091245376079,
576
- "grad_norm": 0.6137486696243286,
577
- "learning_rate": 2e-08,
578
- "loss": 0.0125,
579
  "step": 2000
580
  },
581
  {
582
- "epoch": 2.466091245376079,
583
- "eval_loss": 0.3112793266773224,
584
- "eval_runtime": 1182.7026,
585
- "eval_samples_per_second": 3.054,
586
- "eval_steps_per_second": 0.191,
587
- "eval_wer": 0.12829864835872132,
588
  "step": 2000
589
  },
590
  {
591
- "epoch": 2.466091245376079,
592
- "step": 2000,
593
- "total_flos": 3.265323341119488e+19,
594
- "train_loss": 0.2021937195956707,
595
- "train_runtime": 9527.7148,
596
- "train_samples_per_second": 3.359,
597
- "train_steps_per_second": 0.21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
598
  }
599
  ],
600
  "logging_steps": 25,
601
- "max_steps": 2000,
602
  "num_input_tokens_seen": 0,
603
- "num_train_epochs": 3,
604
  "save_steps": 1000,
605
  "stateful_callbacks": {
606
  "TrainerControl": {
@@ -614,8 +1481,8 @@
614
  "attributes": {}
615
  }
616
  },
617
- "total_flos": 3.265323341119488e+19,
618
- "train_batch_size": 16,
619
  "trial_name": null,
620
  "trial_params": null
621
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 7.407709414381023,
6
  "eval_steps": 1000,
7
+ "global_step": 5000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.037064492216456635,
14
+ "grad_norm": 14.615763664245605,
15
+ "learning_rate": 4.800000000000001e-07,
16
+ "loss": 1.4607,
17
  "step": 25
18
  },
19
  {
20
+ "epoch": 0.07412898443291327,
21
+ "grad_norm": 10.59756851196289,
22
+ "learning_rate": 9.800000000000001e-07,
23
+ "loss": 1.176,
24
  "step": 50
25
  },
26
  {
27
+ "epoch": 0.1111934766493699,
28
+ "grad_norm": 7.142136573791504,
29
+ "learning_rate": 1.48e-06,
30
+ "loss": 0.9267,
31
  "step": 75
32
  },
33
  {
34
+ "epoch": 0.14825796886582654,
35
+ "grad_norm": 9.191902160644531,
36
+ "learning_rate": 1.98e-06,
37
+ "loss": 0.7253,
38
  "step": 100
39
  },
40
  {
41
+ "epoch": 0.18532246108228317,
42
+ "grad_norm": 10.320201873779297,
43
+ "learning_rate": 2.4800000000000004e-06,
44
+ "loss": 0.7047,
45
  "step": 125
46
  },
47
  {
48
+ "epoch": 0.2223869532987398,
49
+ "grad_norm": 8.486912727355957,
50
+ "learning_rate": 2.9800000000000003e-06,
51
+ "loss": 0.6634,
52
  "step": 150
53
  },
54
  {
55
+ "epoch": 0.25945144551519644,
56
+ "grad_norm": 9.802300453186035,
57
+ "learning_rate": 3.48e-06,
58
+ "loss": 0.5786,
59
  "step": 175
60
  },
61
  {
62
+ "epoch": 0.2965159377316531,
63
+ "grad_norm": 9.568249702453613,
64
+ "learning_rate": 3.980000000000001e-06,
65
+ "loss": 0.5857,
66
  "step": 200
67
  },
68
  {
69
+ "epoch": 0.3335804299481097,
70
+ "grad_norm": 7.968526840209961,
71
+ "learning_rate": 4.48e-06,
72
+ "loss": 0.5385,
73
  "step": 225
74
  },
75
  {
76
+ "epoch": 0.37064492216456635,
77
+ "grad_norm": 7.507795810699463,
78
+ "learning_rate": 4.980000000000001e-06,
79
+ "loss": 0.5151,
80
  "step": 250
81
  },
82
  {
83
+ "epoch": 0.407709414381023,
84
+ "grad_norm": 6.258375644683838,
85
+ "learning_rate": 5.480000000000001e-06,
86
+ "loss": 0.4649,
87
  "step": 275
88
  },
89
  {
90
+ "epoch": 0.4447739065974796,
91
+ "grad_norm": 9.89697551727295,
92
+ "learning_rate": 5.98e-06,
93
+ "loss": 0.4209,
94
  "step": 300
95
  },
96
  {
97
+ "epoch": 0.48183839881393625,
98
+ "grad_norm": 7.8507490158081055,
99
+ "learning_rate": 6.480000000000001e-06,
100
+ "loss": 0.4459,
101
  "step": 325
102
  },
103
  {
104
+ "epoch": 0.5189028910303929,
105
+ "grad_norm": 5.835811138153076,
106
+ "learning_rate": 6.98e-06,
107
+ "loss": 0.4141,
108
  "step": 350
109
  },
110
  {
111
+ "epoch": 0.5559673832468495,
112
+ "grad_norm": 6.767547607421875,
113
+ "learning_rate": 7.48e-06,
114
+ "loss": 0.4108,
115
  "step": 375
116
  },
117
  {
118
+ "epoch": 0.5930318754633062,
119
+ "grad_norm": 5.9475884437561035,
120
+ "learning_rate": 7.980000000000002e-06,
121
+ "loss": 0.41,
122
  "step": 400
123
  },
124
  {
125
+ "epoch": 0.6300963676797627,
126
+ "grad_norm": 7.767906188964844,
127
+ "learning_rate": 8.48e-06,
128
+ "loss": 0.3781,
129
  "step": 425
130
  },
131
  {
132
+ "epoch": 0.6671608598962194,
133
+ "grad_norm": 6.990137100219727,
134
+ "learning_rate": 8.98e-06,
135
+ "loss": 0.39,
136
  "step": 450
137
  },
138
  {
139
+ "epoch": 0.704225352112676,
140
+ "grad_norm": 5.607441425323486,
141
+ "learning_rate": 9.48e-06,
142
+ "loss": 0.3783,
143
  "step": 475
144
  },
145
  {
146
+ "epoch": 0.7412898443291327,
147
+ "grad_norm": 6.288857936859131,
148
+ "learning_rate": 9.980000000000001e-06,
149
+ "loss": 0.3559,
150
  "step": 500
151
  },
152
  {
153
+ "epoch": 0.7783543365455893,
154
+ "grad_norm": 6.985698699951172,
155
+ "learning_rate": 9.946666666666667e-06,
156
+ "loss": 0.3595,
157
  "step": 525
158
  },
159
  {
160
+ "epoch": 0.815418828762046,
161
+ "grad_norm": 6.037854194641113,
162
+ "learning_rate": 9.891111111111113e-06,
163
+ "loss": 0.3163,
164
  "step": 550
165
  },
166
  {
167
+ "epoch": 0.8524833209785025,
168
+ "grad_norm": 5.8710784912109375,
169
+ "learning_rate": 9.835555555555556e-06,
170
+ "loss": 0.3502,
171
  "step": 575
172
  },
173
  {
174
+ "epoch": 0.8895478131949592,
175
+ "grad_norm": 6.342834949493408,
176
+ "learning_rate": 9.780000000000001e-06,
177
+ "loss": 0.317,
178
  "step": 600
179
  },
180
  {
181
+ "epoch": 0.9266123054114158,
182
+ "grad_norm": 5.589534759521484,
183
+ "learning_rate": 9.724444444444445e-06,
184
+ "loss": 0.3228,
185
  "step": 625
186
  },
187
  {
188
+ "epoch": 0.9636767976278725,
189
+ "grad_norm": 7.743918418884277,
190
+ "learning_rate": 9.66888888888889e-06,
191
+ "loss": 0.3144,
192
  "step": 650
193
  },
194
  {
195
+ "epoch": 1.0,
196
+ "grad_norm": 10.073568344116211,
197
+ "learning_rate": 9.613333333333335e-06,
198
+ "loss": 0.2939,
199
  "step": 675
200
  },
201
  {
202
+ "epoch": 1.0370644922164567,
203
+ "grad_norm": 4.640520095825195,
204
+ "learning_rate": 9.557777777777777e-06,
205
+ "loss": 0.1939,
206
  "step": 700
207
  },
208
  {
209
+ "epoch": 1.0741289844329134,
210
+ "grad_norm": 3.2049508094787598,
211
+ "learning_rate": 9.502222222222223e-06,
212
+ "loss": 0.1929,
213
  "step": 725
214
  },
215
  {
216
+ "epoch": 1.1111934766493698,
217
+ "grad_norm": 3.9065611362457275,
218
+ "learning_rate": 9.446666666666667e-06,
219
+ "loss": 0.1998,
220
  "step": 750
221
  },
222
  {
223
+ "epoch": 1.1482579688658265,
224
+ "grad_norm": 3.7471649646759033,
225
+ "learning_rate": 9.391111111111111e-06,
226
+ "loss": 0.2007,
227
  "step": 775
228
  },
229
  {
230
+ "epoch": 1.1853224610822832,
231
+ "grad_norm": 3.952751874923706,
232
+ "learning_rate": 9.335555555555557e-06,
233
+ "loss": 0.1863,
234
  "step": 800
235
  },
236
  {
237
+ "epoch": 1.2223869532987397,
238
+ "grad_norm": 5.39549446105957,
239
+ "learning_rate": 9.280000000000001e-06,
240
+ "loss": 0.1953,
241
  "step": 825
242
  },
243
  {
244
+ "epoch": 1.2594514455151964,
245
+ "grad_norm": 4.03216552734375,
246
+ "learning_rate": 9.224444444444445e-06,
247
+ "loss": 0.2065,
248
  "step": 850
249
  },
250
  {
251
+ "epoch": 1.296515937731653,
252
+ "grad_norm": 3.854651689529419,
253
+ "learning_rate": 9.168888888888889e-06,
254
+ "loss": 0.1703,
255
  "step": 875
256
  },
257
  {
258
+ "epoch": 1.3335804299481098,
259
+ "grad_norm": 4.835360050201416,
260
+ "learning_rate": 9.113333333333335e-06,
261
+ "loss": 0.1692,
262
  "step": 900
263
  },
264
  {
265
+ "epoch": 1.3706449221645665,
266
+ "grad_norm": 5.247130393981934,
267
+ "learning_rate": 9.057777777777779e-06,
268
+ "loss": 0.1982,
269
  "step": 925
270
  },
271
  {
272
+ "epoch": 1.407709414381023,
273
+ "grad_norm": 3.9537737369537354,
274
+ "learning_rate": 9.002222222222223e-06,
275
+ "loss": 0.1661,
276
  "step": 950
277
  },
278
  {
279
+ "epoch": 1.4447739065974796,
280
+ "grad_norm": 4.887810230255127,
281
+ "learning_rate": 8.946666666666669e-06,
282
+ "loss": 0.1836,
283
  "step": 975
284
  },
285
  {
286
+ "epoch": 1.4818383988139363,
287
+ "grad_norm": 3.6338751316070557,
288
+ "learning_rate": 8.891111111111111e-06,
289
+ "loss": 0.1822,
290
  "step": 1000
291
  },
292
  {
293
+ "epoch": 1.4818383988139363,
294
+ "eval_loss": 0.2655850648880005,
295
+ "eval_runtime": 730.9503,
296
+ "eval_samples_per_second": 3.947,
297
+ "eval_steps_per_second": 0.494,
298
+ "eval_wer": 0.14449384404924762,
299
  "step": 1000
300
  },
301
  {
302
+ "epoch": 1.5189028910303928,
303
+ "grad_norm": 4.078255653381348,
304
+ "learning_rate": 8.835555555555557e-06,
305
+ "loss": 0.1661,
306
  "step": 1025
307
  },
308
  {
309
+ "epoch": 1.5559673832468495,
310
+ "grad_norm": 3.9311952590942383,
311
+ "learning_rate": 8.78e-06,
312
+ "loss": 0.1725,
313
  "step": 1050
314
  },
315
  {
316
+ "epoch": 1.5930318754633062,
317
+ "grad_norm": 4.800196170806885,
318
+ "learning_rate": 8.724444444444445e-06,
319
+ "loss": 0.1704,
320
  "step": 1075
321
  },
322
  {
323
+ "epoch": 1.6300963676797626,
324
+ "grad_norm": 4.550530910491943,
325
+ "learning_rate": 8.66888888888889e-06,
326
+ "loss": 0.1793,
327
  "step": 1100
328
  },
329
  {
330
+ "epoch": 1.6671608598962195,
331
+ "grad_norm": 6.508624076843262,
332
+ "learning_rate": 8.613333333333333e-06,
333
+ "loss": 0.1619,
334
  "step": 1125
335
  },
336
  {
337
+ "epoch": 1.704225352112676,
338
+ "grad_norm": 4.16792106628418,
339
+ "learning_rate": 8.557777777777778e-06,
340
+ "loss": 0.1652,
341
  "step": 1150
342
  },
343
  {
344
+ "epoch": 1.7412898443291327,
345
+ "grad_norm": 4.420657157897949,
346
+ "learning_rate": 8.502222222222223e-06,
347
+ "loss": 0.16,
348
  "step": 1175
349
  },
350
  {
351
+ "epoch": 1.7783543365455894,
352
+ "grad_norm": 4.781569004058838,
353
+ "learning_rate": 8.446666666666668e-06,
354
+ "loss": 0.1695,
355
  "step": 1200
356
  },
357
  {
358
+ "epoch": 1.8154188287620459,
359
+ "grad_norm": 3.877307176589966,
360
+ "learning_rate": 8.391111111111112e-06,
361
+ "loss": 0.1529,
362
  "step": 1225
363
  },
364
  {
365
+ "epoch": 1.8524833209785025,
366
+ "grad_norm": 4.159163475036621,
367
+ "learning_rate": 8.335555555555556e-06,
368
+ "loss": 0.1619,
369
  "step": 1250
370
  },
371
  {
372
+ "epoch": 1.8895478131949592,
373
+ "grad_norm": 3.6631579399108887,
374
+ "learning_rate": 8.28e-06,
375
+ "loss": 0.1654,
376
  "step": 1275
377
  },
378
  {
379
+ "epoch": 1.9266123054114157,
380
+ "grad_norm": 4.1784210205078125,
381
+ "learning_rate": 8.224444444444444e-06,
382
+ "loss": 0.1494,
383
  "step": 1300
384
  },
385
  {
386
+ "epoch": 1.9636767976278726,
387
+ "grad_norm": 5.867852210998535,
388
+ "learning_rate": 8.16888888888889e-06,
389
+ "loss": 0.1443,
390
  "step": 1325
391
  },
392
  {
393
+ "epoch": 2.0,
394
+ "grad_norm": 5.817214012145996,
395
+ "learning_rate": 8.113333333333334e-06,
396
+ "loss": 0.139,
397
  "step": 1350
398
  },
399
  {
400
+ "epoch": 2.0370644922164565,
401
+ "grad_norm": 2.3572022914886475,
402
+ "learning_rate": 8.057777777777778e-06,
403
+ "loss": 0.0614,
404
  "step": 1375
405
  },
406
  {
407
+ "epoch": 2.0741289844329134,
408
+ "grad_norm": 2.2769412994384766,
409
+ "learning_rate": 8.002222222222222e-06,
410
+ "loss": 0.0606,
411
  "step": 1400
412
  },
413
  {
414
+ "epoch": 2.11119347664937,
415
+ "grad_norm": 2.474583864212036,
416
+ "learning_rate": 7.946666666666666e-06,
417
+ "loss": 0.0716,
418
  "step": 1425
419
  },
420
  {
421
+ "epoch": 2.1482579688658268,
422
+ "grad_norm": 2.5783841609954834,
423
+ "learning_rate": 7.891111111111112e-06,
424
+ "loss": 0.065,
425
  "step": 1450
426
  },
427
  {
428
+ "epoch": 2.1853224610822832,
429
+ "grad_norm": 1.6132420301437378,
430
+ "learning_rate": 7.835555555555556e-06,
431
+ "loss": 0.067,
432
  "step": 1475
433
  },
434
  {
435
+ "epoch": 2.2223869532987397,
436
+ "grad_norm": 3.8042001724243164,
437
+ "learning_rate": 7.78e-06,
438
+ "loss": 0.0724,
439
  "step": 1500
440
  },
441
  {
442
+ "epoch": 2.2594514455151966,
443
+ "grad_norm": 2.2419843673706055,
444
+ "learning_rate": 7.724444444444446e-06,
445
+ "loss": 0.0761,
446
  "step": 1525
447
  },
448
  {
449
+ "epoch": 2.296515937731653,
450
+ "grad_norm": 2.706354856491089,
451
+ "learning_rate": 7.66888888888889e-06,
452
+ "loss": 0.0659,
453
  "step": 1550
454
  },
455
  {
456
+ "epoch": 2.3335804299481095,
457
+ "grad_norm": 2.8394265174865723,
458
+ "learning_rate": 7.613333333333334e-06,
459
+ "loss": 0.0688,
460
  "step": 1575
461
  },
462
  {
463
+ "epoch": 2.3706449221645665,
464
+ "grad_norm": 2.383784770965576,
465
+ "learning_rate": 7.557777777777779e-06,
466
+ "loss": 0.0729,
467
  "step": 1600
468
  },
469
  {
470
+ "epoch": 2.407709414381023,
471
+ "grad_norm": 3.0959832668304443,
472
+ "learning_rate": 7.502222222222223e-06,
473
+ "loss": 0.0626,
474
  "step": 1625
475
  },
476
  {
477
+ "epoch": 2.4447739065974794,
478
+ "grad_norm": 2.927393913269043,
479
+ "learning_rate": 7.446666666666668e-06,
480
+ "loss": 0.0677,
481
  "step": 1650
482
  },
483
  {
484
+ "epoch": 2.4818383988139363,
485
+ "grad_norm": 2.644434928894043,
486
+ "learning_rate": 7.3911111111111125e-06,
487
+ "loss": 0.0644,
488
  "step": 1675
489
  },
490
  {
491
+ "epoch": 2.5189028910303928,
492
+ "grad_norm": 2.9071755409240723,
493
+ "learning_rate": 7.335555555555556e-06,
494
+ "loss": 0.061,
495
  "step": 1700
496
  },
497
  {
498
+ "epoch": 2.5559673832468492,
499
+ "grad_norm": 2.6862034797668457,
500
+ "learning_rate": 7.280000000000001e-06,
501
+ "loss": 0.0615,
502
  "step": 1725
503
  },
504
  {
505
+ "epoch": 2.593031875463306,
506
+ "grad_norm": 3.1184046268463135,
507
+ "learning_rate": 7.224444444444445e-06,
508
+ "loss": 0.0714,
509
  "step": 1750
510
  },
511
  {
512
+ "epoch": 2.6300963676797626,
513
+ "grad_norm": 1.7592053413391113,
514
+ "learning_rate": 7.1688888888888895e-06,
515
+ "loss": 0.0704,
516
  "step": 1775
517
  },
518
  {
519
+ "epoch": 2.6671608598962195,
520
+ "grad_norm": 2.9316508769989014,
521
+ "learning_rate": 7.113333333333334e-06,
522
+ "loss": 0.0689,
523
  "step": 1800
524
  },
525
  {
526
+ "epoch": 2.704225352112676,
527
+ "grad_norm": 2.1934666633605957,
528
+ "learning_rate": 7.057777777777778e-06,
529
+ "loss": 0.0721,
530
  "step": 1825
531
  },
532
  {
533
+ "epoch": 2.741289844329133,
534
+ "grad_norm": 3.4919371604919434,
535
+ "learning_rate": 7.0022222222222225e-06,
536
+ "loss": 0.0638,
537
  "step": 1850
538
  },
539
  {
540
+ "epoch": 2.7783543365455894,
541
+ "grad_norm": 2.723252058029175,
542
+ "learning_rate": 6.946666666666667e-06,
543
+ "loss": 0.0598,
544
  "step": 1875
545
  },
546
  {
547
+ "epoch": 2.815418828762046,
548
+ "grad_norm": 1.8668267726898193,
549
+ "learning_rate": 6.891111111111111e-06,
550
+ "loss": 0.0607,
551
  "step": 1900
552
  },
553
  {
554
+ "epoch": 2.8524833209785028,
555
+ "grad_norm": 2.0989866256713867,
556
+ "learning_rate": 6.835555555555556e-06,
557
+ "loss": 0.0821,
558
  "step": 1925
559
  },
560
  {
561
+ "epoch": 2.8895478131949592,
562
+ "grad_norm": 2.9375364780426025,
563
+ "learning_rate": 6.780000000000001e-06,
564
+ "loss": 0.0636,
565
  "step": 1950
566
  },
567
  {
568
+ "epoch": 2.9266123054114157,
569
+ "grad_norm": 2.1375315189361572,
570
+ "learning_rate": 6.724444444444444e-06,
571
+ "loss": 0.0723,
572
  "step": 1975
573
  },
574
  {
575
+ "epoch": 2.9636767976278726,
576
+ "grad_norm": 2.5874264240264893,
577
+ "learning_rate": 6.668888888888889e-06,
578
+ "loss": 0.0706,
579
  "step": 2000
580
  },
581
  {
582
+ "epoch": 2.9636767976278726,
583
+ "eval_loss": 0.2490690052509308,
584
+ "eval_runtime": 730.2087,
585
+ "eval_samples_per_second": 3.951,
586
+ "eval_steps_per_second": 0.494,
587
+ "eval_wer": 0.12696648426812585,
588
  "step": 2000
589
  },
590
  {
591
+ "epoch": 3.0,
592
+ "grad_norm": 6.509148597717285,
593
+ "learning_rate": 6.613333333333334e-06,
594
+ "loss": 0.0587,
595
+ "step": 2025
596
+ },
597
+ {
598
+ "epoch": 3.0370644922164565,
599
+ "grad_norm": 1.9590086936950684,
600
+ "learning_rate": 6.557777777777778e-06,
601
+ "loss": 0.0241,
602
+ "step": 2050
603
+ },
604
+ {
605
+ "epoch": 3.0741289844329134,
606
+ "grad_norm": 1.4612740278244019,
607
+ "learning_rate": 6.502222222222223e-06,
608
+ "loss": 0.0267,
609
+ "step": 2075
610
+ },
611
+ {
612
+ "epoch": 3.11119347664937,
613
+ "grad_norm": 0.9522780179977417,
614
+ "learning_rate": 6.446666666666668e-06,
615
+ "loss": 0.023,
616
+ "step": 2100
617
+ },
618
+ {
619
+ "epoch": 3.1482579688658268,
620
+ "grad_norm": 1.891400694847107,
621
+ "learning_rate": 6.391111111111111e-06,
622
+ "loss": 0.0281,
623
+ "step": 2125
624
+ },
625
+ {
626
+ "epoch": 3.1853224610822832,
627
+ "grad_norm": 1.0783302783966064,
628
+ "learning_rate": 6.335555555555556e-06,
629
+ "loss": 0.0246,
630
+ "step": 2150
631
+ },
632
+ {
633
+ "epoch": 3.2223869532987397,
634
+ "grad_norm": 1.3504562377929688,
635
+ "learning_rate": 6.280000000000001e-06,
636
+ "loss": 0.0244,
637
+ "step": 2175
638
+ },
639
+ {
640
+ "epoch": 3.2594514455151966,
641
+ "grad_norm": 1.8768439292907715,
642
+ "learning_rate": 6.224444444444445e-06,
643
+ "loss": 0.0264,
644
+ "step": 2200
645
+ },
646
+ {
647
+ "epoch": 3.296515937731653,
648
+ "grad_norm": 1.5083887577056885,
649
+ "learning_rate": 6.16888888888889e-06,
650
+ "loss": 0.0248,
651
+ "step": 2225
652
+ },
653
+ {
654
+ "epoch": 3.3335804299481095,
655
+ "grad_norm": 3.5768120288848877,
656
+ "learning_rate": 6.113333333333333e-06,
657
+ "loss": 0.0316,
658
+ "step": 2250
659
+ },
660
+ {
661
+ "epoch": 3.3706449221645665,
662
+ "grad_norm": 1.1493444442749023,
663
+ "learning_rate": 6.057777777777778e-06,
664
+ "loss": 0.0294,
665
+ "step": 2275
666
+ },
667
+ {
668
+ "epoch": 3.407709414381023,
669
+ "grad_norm": 2.3746306896209717,
670
+ "learning_rate": 6.002222222222223e-06,
671
+ "loss": 0.0263,
672
+ "step": 2300
673
+ },
674
+ {
675
+ "epoch": 3.4447739065974794,
676
+ "grad_norm": 2.144634485244751,
677
+ "learning_rate": 5.946666666666668e-06,
678
+ "loss": 0.0348,
679
+ "step": 2325
680
+ },
681
+ {
682
+ "epoch": 3.4818383988139363,
683
+ "grad_norm": 1.5002686977386475,
684
+ "learning_rate": 5.891111111111112e-06,
685
+ "loss": 0.0228,
686
+ "step": 2350
687
+ },
688
+ {
689
+ "epoch": 3.5189028910303928,
690
+ "grad_norm": 1.6059187650680542,
691
+ "learning_rate": 5.8355555555555565e-06,
692
+ "loss": 0.0239,
693
+ "step": 2375
694
+ },
695
+ {
696
+ "epoch": 3.5559673832468492,
697
+ "grad_norm": 2.757420778274536,
698
+ "learning_rate": 5.78e-06,
699
+ "loss": 0.0277,
700
+ "step": 2400
701
+ },
702
+ {
703
+ "epoch": 3.593031875463306,
704
+ "grad_norm": 1.3977222442626953,
705
+ "learning_rate": 5.724444444444445e-06,
706
+ "loss": 0.0224,
707
+ "step": 2425
708
+ },
709
+ {
710
+ "epoch": 3.6300963676797626,
711
+ "grad_norm": 1.9618048667907715,
712
+ "learning_rate": 5.6688888888888895e-06,
713
+ "loss": 0.026,
714
+ "step": 2450
715
+ },
716
+ {
717
+ "epoch": 3.6671608598962195,
718
+ "grad_norm": 0.898245632648468,
719
+ "learning_rate": 5.613333333333334e-06,
720
+ "loss": 0.0326,
721
+ "step": 2475
722
+ },
723
+ {
724
+ "epoch": 3.704225352112676,
725
+ "grad_norm": 1.8148616552352905,
726
+ "learning_rate": 5.557777777777778e-06,
727
+ "loss": 0.0213,
728
+ "step": 2500
729
+ },
730
+ {
731
+ "epoch": 3.741289844329133,
732
+ "grad_norm": 1.308030366897583,
733
+ "learning_rate": 5.5022222222222224e-06,
734
+ "loss": 0.0192,
735
+ "step": 2525
736
+ },
737
+ {
738
+ "epoch": 3.7783543365455894,
739
+ "grad_norm": 1.6680744886398315,
740
+ "learning_rate": 5.4466666666666665e-06,
741
+ "loss": 0.027,
742
+ "step": 2550
743
+ },
744
+ {
745
+ "epoch": 3.815418828762046,
746
+ "grad_norm": 3.235917568206787,
747
+ "learning_rate": 5.391111111111111e-06,
748
+ "loss": 0.0242,
749
+ "step": 2575
750
+ },
751
+ {
752
+ "epoch": 3.8524833209785028,
753
+ "grad_norm": 2.096780300140381,
754
+ "learning_rate": 5.335555555555556e-06,
755
+ "loss": 0.0243,
756
+ "step": 2600
757
+ },
758
+ {
759
+ "epoch": 3.8895478131949592,
760
+ "grad_norm": 1.8445031642913818,
761
+ "learning_rate": 5.28e-06,
762
+ "loss": 0.024,
763
+ "step": 2625
764
+ },
765
+ {
766
+ "epoch": 3.9266123054114157,
767
+ "grad_norm": 1.357937216758728,
768
+ "learning_rate": 5.224444444444445e-06,
769
+ "loss": 0.0244,
770
+ "step": 2650
771
+ },
772
+ {
773
+ "epoch": 3.9636767976278726,
774
+ "grad_norm": 1.0413466691970825,
775
+ "learning_rate": 5.168888888888889e-06,
776
+ "loss": 0.0221,
777
+ "step": 2675
778
+ },
779
+ {
780
+ "epoch": 4.0,
781
+ "grad_norm": 3.0572996139526367,
782
+ "learning_rate": 5.113333333333333e-06,
783
+ "loss": 0.0206,
784
+ "step": 2700
785
+ },
786
+ {
787
+ "epoch": 4.037064492216457,
788
+ "grad_norm": 0.9961848258972168,
789
+ "learning_rate": 5.057777777777778e-06,
790
+ "loss": 0.0136,
791
+ "step": 2725
792
+ },
793
+ {
794
+ "epoch": 4.074128984432913,
795
+ "grad_norm": 1.0248702764511108,
796
+ "learning_rate": 5.002222222222223e-06,
797
+ "loss": 0.009,
798
+ "step": 2750
799
+ },
800
+ {
801
+ "epoch": 4.11119347664937,
802
+ "grad_norm": 0.6142157912254333,
803
+ "learning_rate": 4.946666666666667e-06,
804
+ "loss": 0.0113,
805
+ "step": 2775
806
+ },
807
+ {
808
+ "epoch": 4.148257968865827,
809
+ "grad_norm": 0.27292531728744507,
810
+ "learning_rate": 4.891111111111111e-06,
811
+ "loss": 0.009,
812
+ "step": 2800
813
+ },
814
+ {
815
+ "epoch": 4.185322461082283,
816
+ "grad_norm": 2.2906312942504883,
817
+ "learning_rate": 4.835555555555556e-06,
818
+ "loss": 0.0073,
819
+ "step": 2825
820
+ },
821
+ {
822
+ "epoch": 4.22238695329874,
823
+ "grad_norm": 1.0498850345611572,
824
+ "learning_rate": 4.78e-06,
825
+ "loss": 0.0093,
826
+ "step": 2850
827
+ },
828
+ {
829
+ "epoch": 4.259451445515197,
830
+ "grad_norm": 1.1574844121932983,
831
+ "learning_rate": 4.724444444444445e-06,
832
+ "loss": 0.0159,
833
+ "step": 2875
834
+ },
835
+ {
836
+ "epoch": 4.2965159377316535,
837
+ "grad_norm": 0.7209671139717102,
838
+ "learning_rate": 4.66888888888889e-06,
839
+ "loss": 0.0088,
840
+ "step": 2900
841
+ },
842
+ {
843
+ "epoch": 4.3335804299481095,
844
+ "grad_norm": 1.168841004371643,
845
+ "learning_rate": 4.613333333333334e-06,
846
+ "loss": 0.0094,
847
+ "step": 2925
848
+ },
849
+ {
850
+ "epoch": 4.3706449221645665,
851
+ "grad_norm": 0.6153778433799744,
852
+ "learning_rate": 4.557777777777778e-06,
853
+ "loss": 0.009,
854
+ "step": 2950
855
+ },
856
+ {
857
+ "epoch": 4.407709414381023,
858
+ "grad_norm": 1.5705232620239258,
859
+ "learning_rate": 4.502222222222223e-06,
860
+ "loss": 0.0085,
861
+ "step": 2975
862
+ },
863
+ {
864
+ "epoch": 4.444773906597479,
865
+ "grad_norm": 0.24448032677173615,
866
+ "learning_rate": 4.446666666666667e-06,
867
+ "loss": 0.0072,
868
+ "step": 3000
869
+ },
870
+ {
871
+ "epoch": 4.444773906597479,
872
+ "eval_loss": 0.27286583185195923,
873
+ "eval_runtime": 739.8615,
874
+ "eval_samples_per_second": 3.899,
875
+ "eval_steps_per_second": 0.488,
876
+ "eval_wer": 0.11913474692202462,
877
+ "step": 3000
878
+ },
879
+ {
880
+ "epoch": 4.481838398813936,
881
+ "grad_norm": 1.2278587818145752,
882
+ "learning_rate": 4.391111111111112e-06,
883
+ "loss": 0.0146,
884
+ "step": 3025
885
+ },
886
+ {
887
+ "epoch": 4.518902891030393,
888
+ "grad_norm": 0.6478213667869568,
889
+ "learning_rate": 4.3355555555555565e-06,
890
+ "loss": 0.014,
891
+ "step": 3050
892
+ },
893
+ {
894
+ "epoch": 4.555967383246849,
895
+ "grad_norm": 0.7865190505981445,
896
+ "learning_rate": 4.2800000000000005e-06,
897
+ "loss": 0.0079,
898
+ "step": 3075
899
+ },
900
+ {
901
+ "epoch": 4.593031875463306,
902
+ "grad_norm": 2.3078877925872803,
903
+ "learning_rate": 4.2244444444444446e-06,
904
+ "loss": 0.009,
905
+ "step": 3100
906
+ },
907
+ {
908
+ "epoch": 4.630096367679763,
909
+ "grad_norm": 0.9625842571258545,
910
+ "learning_rate": 4.168888888888889e-06,
911
+ "loss": 0.0096,
912
+ "step": 3125
913
+ },
914
+ {
915
+ "epoch": 4.667160859896219,
916
+ "grad_norm": 0.7619579434394836,
917
+ "learning_rate": 4.1133333333333335e-06,
918
+ "loss": 0.0096,
919
+ "step": 3150
920
+ },
921
+ {
922
+ "epoch": 4.704225352112676,
923
+ "grad_norm": 1.5049270391464233,
924
+ "learning_rate": 4.057777777777778e-06,
925
+ "loss": 0.0099,
926
+ "step": 3175
927
+ },
928
+ {
929
+ "epoch": 4.741289844329133,
930
+ "grad_norm": 1.1056573390960693,
931
+ "learning_rate": 4.002222222222222e-06,
932
+ "loss": 0.0065,
933
+ "step": 3200
934
+ },
935
+ {
936
+ "epoch": 4.778354336545589,
937
+ "grad_norm": 0.7983392477035522,
938
+ "learning_rate": 3.946666666666667e-06,
939
+ "loss": 0.0105,
940
+ "step": 3225
941
+ },
942
+ {
943
+ "epoch": 4.815418828762046,
944
+ "grad_norm": 1.1153795719146729,
945
+ "learning_rate": 3.891111111111111e-06,
946
+ "loss": 0.0075,
947
+ "step": 3250
948
+ },
949
+ {
950
+ "epoch": 4.852483320978503,
951
+ "grad_norm": 0.9730608463287354,
952
+ "learning_rate": 3.835555555555555e-06,
953
+ "loss": 0.0087,
954
+ "step": 3275
955
+ },
956
+ {
957
+ "epoch": 4.889547813194959,
958
+ "grad_norm": 0.5694206953048706,
959
+ "learning_rate": 3.7800000000000002e-06,
960
+ "loss": 0.0071,
961
+ "step": 3300
962
+ },
963
+ {
964
+ "epoch": 4.926612305411416,
965
+ "grad_norm": 0.2520028352737427,
966
+ "learning_rate": 3.724444444444445e-06,
967
+ "loss": 0.0081,
968
+ "step": 3325
969
+ },
970
+ {
971
+ "epoch": 4.963676797627873,
972
+ "grad_norm": 0.436355322599411,
973
+ "learning_rate": 3.668888888888889e-06,
974
+ "loss": 0.0078,
975
+ "step": 3350
976
+ },
977
+ {
978
+ "epoch": 5.0,
979
+ "grad_norm": 0.798361599445343,
980
+ "learning_rate": 3.6133333333333336e-06,
981
+ "loss": 0.0075,
982
+ "step": 3375
983
+ },
984
+ {
985
+ "epoch": 5.037064492216457,
986
+ "grad_norm": 1.3702267408370972,
987
+ "learning_rate": 3.5577777777777785e-06,
988
+ "loss": 0.005,
989
+ "step": 3400
990
+ },
991
+ {
992
+ "epoch": 5.074128984432913,
993
+ "grad_norm": 0.2790464162826538,
994
+ "learning_rate": 3.5022222222222225e-06,
995
+ "loss": 0.0032,
996
+ "step": 3425
997
+ },
998
+ {
999
+ "epoch": 5.11119347664937,
1000
+ "grad_norm": 0.15111476182937622,
1001
+ "learning_rate": 3.446666666666667e-06,
1002
+ "loss": 0.0046,
1003
+ "step": 3450
1004
+ },
1005
+ {
1006
+ "epoch": 5.148257968865827,
1007
+ "grad_norm": 0.09985285252332687,
1008
+ "learning_rate": 3.391111111111111e-06,
1009
+ "loss": 0.0035,
1010
+ "step": 3475
1011
+ },
1012
+ {
1013
+ "epoch": 5.185322461082283,
1014
+ "grad_norm": 0.5352105498313904,
1015
+ "learning_rate": 3.335555555555556e-06,
1016
+ "loss": 0.0031,
1017
+ "step": 3500
1018
+ },
1019
+ {
1020
+ "epoch": 5.22238695329874,
1021
+ "grad_norm": 0.9406213760375977,
1022
+ "learning_rate": 3.2800000000000004e-06,
1023
+ "loss": 0.0035,
1024
+ "step": 3525
1025
+ },
1026
+ {
1027
+ "epoch": 5.259451445515197,
1028
+ "grad_norm": 0.7073507905006409,
1029
+ "learning_rate": 3.2244444444444444e-06,
1030
+ "loss": 0.0035,
1031
+ "step": 3550
1032
+ },
1033
+ {
1034
+ "epoch": 5.2965159377316535,
1035
+ "grad_norm": 0.07916448265314102,
1036
+ "learning_rate": 3.1688888888888893e-06,
1037
+ "loss": 0.0035,
1038
+ "step": 3575
1039
+ },
1040
+ {
1041
+ "epoch": 5.3335804299481095,
1042
+ "grad_norm": 0.5285120606422424,
1043
+ "learning_rate": 3.1133333333333337e-06,
1044
+ "loss": 0.0027,
1045
+ "step": 3600
1046
+ },
1047
+ {
1048
+ "epoch": 5.3706449221645665,
1049
+ "grad_norm": 0.09832775592803955,
1050
+ "learning_rate": 3.0577777777777778e-06,
1051
+ "loss": 0.0036,
1052
+ "step": 3625
1053
+ },
1054
+ {
1055
+ "epoch": 5.407709414381023,
1056
+ "grad_norm": 0.21083103120326996,
1057
+ "learning_rate": 3.0022222222222227e-06,
1058
+ "loss": 0.0041,
1059
+ "step": 3650
1060
+ },
1061
+ {
1062
+ "epoch": 5.444773906597479,
1063
+ "grad_norm": 0.6747980713844299,
1064
+ "learning_rate": 2.946666666666667e-06,
1065
+ "loss": 0.003,
1066
+ "step": 3675
1067
+ },
1068
+ {
1069
+ "epoch": 5.481838398813936,
1070
+ "grad_norm": 0.5111549496650696,
1071
+ "learning_rate": 2.891111111111111e-06,
1072
+ "loss": 0.0028,
1073
+ "step": 3700
1074
+ },
1075
+ {
1076
+ "epoch": 5.518902891030393,
1077
+ "grad_norm": 0.6502516269683838,
1078
+ "learning_rate": 2.835555555555556e-06,
1079
+ "loss": 0.0045,
1080
+ "step": 3725
1081
+ },
1082
+ {
1083
+ "epoch": 5.555967383246849,
1084
+ "grad_norm": 0.4688964784145355,
1085
+ "learning_rate": 2.7800000000000005e-06,
1086
+ "loss": 0.0036,
1087
+ "step": 3750
1088
+ },
1089
+ {
1090
+ "epoch": 5.593031875463306,
1091
+ "grad_norm": 0.281994104385376,
1092
+ "learning_rate": 2.7244444444444445e-06,
1093
+ "loss": 0.0021,
1094
+ "step": 3775
1095
+ },
1096
+ {
1097
+ "epoch": 5.630096367679763,
1098
+ "grad_norm": 0.11583279073238373,
1099
+ "learning_rate": 2.6688888888888894e-06,
1100
+ "loss": 0.0041,
1101
+ "step": 3800
1102
+ },
1103
+ {
1104
+ "epoch": 5.667160859896219,
1105
+ "grad_norm": 0.22941534221172333,
1106
+ "learning_rate": 2.6133333333333334e-06,
1107
+ "loss": 0.0022,
1108
+ "step": 3825
1109
+ },
1110
+ {
1111
+ "epoch": 5.704225352112676,
1112
+ "grad_norm": 0.13950073719024658,
1113
+ "learning_rate": 2.557777777777778e-06,
1114
+ "loss": 0.003,
1115
+ "step": 3850
1116
+ },
1117
+ {
1118
+ "epoch": 5.741289844329133,
1119
+ "grad_norm": 0.6869206428527832,
1120
+ "learning_rate": 2.5022222222222224e-06,
1121
+ "loss": 0.0024,
1122
+ "step": 3875
1123
+ },
1124
+ {
1125
+ "epoch": 5.778354336545589,
1126
+ "grad_norm": 0.09893081337213516,
1127
+ "learning_rate": 2.446666666666667e-06,
1128
+ "loss": 0.0029,
1129
+ "step": 3900
1130
+ },
1131
+ {
1132
+ "epoch": 5.815418828762046,
1133
+ "grad_norm": 0.1264762133359909,
1134
+ "learning_rate": 2.3911111111111113e-06,
1135
+ "loss": 0.0033,
1136
+ "step": 3925
1137
+ },
1138
+ {
1139
+ "epoch": 5.852483320978503,
1140
+ "grad_norm": 0.15489889681339264,
1141
+ "learning_rate": 2.3355555555555557e-06,
1142
+ "loss": 0.003,
1143
+ "step": 3950
1144
+ },
1145
+ {
1146
+ "epoch": 5.889547813194959,
1147
+ "grad_norm": 0.5875250697135925,
1148
+ "learning_rate": 2.28e-06,
1149
+ "loss": 0.0022,
1150
+ "step": 3975
1151
+ },
1152
+ {
1153
+ "epoch": 5.926612305411416,
1154
+ "grad_norm": 0.06691984087228775,
1155
+ "learning_rate": 2.2244444444444447e-06,
1156
+ "loss": 0.005,
1157
+ "step": 4000
1158
+ },
1159
+ {
1160
+ "epoch": 5.926612305411416,
1161
+ "eval_loss": 0.28099098801612854,
1162
+ "eval_runtime": 734.9707,
1163
+ "eval_samples_per_second": 3.925,
1164
+ "eval_steps_per_second": 0.491,
1165
+ "eval_wer": 0.11566347469220246,
1166
+ "step": 4000
1167
+ },
1168
+ {
1169
+ "epoch": 5.963676797627873,
1170
+ "grad_norm": 0.2645249664783478,
1171
+ "learning_rate": 2.168888888888889e-06,
1172
+ "loss": 0.0026,
1173
+ "step": 4025
1174
+ },
1175
+ {
1176
+ "epoch": 6.0,
1177
+ "grad_norm": 0.3361597955226898,
1178
+ "learning_rate": 2.1133333333333336e-06,
1179
+ "loss": 0.0023,
1180
+ "step": 4050
1181
+ },
1182
+ {
1183
+ "epoch": 6.037064492216457,
1184
+ "grad_norm": 0.059147898107767105,
1185
+ "learning_rate": 2.057777777777778e-06,
1186
+ "loss": 0.0015,
1187
+ "step": 4075
1188
+ },
1189
+ {
1190
+ "epoch": 6.074128984432913,
1191
+ "grad_norm": 0.1158735603094101,
1192
+ "learning_rate": 2.0022222222222225e-06,
1193
+ "loss": 0.0016,
1194
+ "step": 4100
1195
+ },
1196
+ {
1197
+ "epoch": 6.11119347664937,
1198
+ "grad_norm": 1.3564985990524292,
1199
+ "learning_rate": 1.9466666666666665e-06,
1200
+ "loss": 0.0014,
1201
+ "step": 4125
1202
+ },
1203
+ {
1204
+ "epoch": 6.148257968865827,
1205
+ "grad_norm": 0.5956087112426758,
1206
+ "learning_rate": 1.8911111111111114e-06,
1207
+ "loss": 0.0018,
1208
+ "step": 4150
1209
+ },
1210
+ {
1211
+ "epoch": 6.185322461082283,
1212
+ "grad_norm": 0.09224885702133179,
1213
+ "learning_rate": 1.8355555555555557e-06,
1214
+ "loss": 0.0017,
1215
+ "step": 4175
1216
+ },
1217
+ {
1218
+ "epoch": 6.22238695329874,
1219
+ "grad_norm": 0.06868930906057358,
1220
+ "learning_rate": 1.7800000000000001e-06,
1221
+ "loss": 0.0017,
1222
+ "step": 4200
1223
+ },
1224
+ {
1225
+ "epoch": 6.259451445515197,
1226
+ "grad_norm": 0.06657718122005463,
1227
+ "learning_rate": 1.7244444444444448e-06,
1228
+ "loss": 0.0014,
1229
+ "step": 4225
1230
+ },
1231
+ {
1232
+ "epoch": 6.2965159377316535,
1233
+ "grad_norm": 0.05459928885102272,
1234
+ "learning_rate": 1.668888888888889e-06,
1235
+ "loss": 0.0017,
1236
+ "step": 4250
1237
+ },
1238
+ {
1239
+ "epoch": 6.3335804299481095,
1240
+ "grad_norm": 0.05795517563819885,
1241
+ "learning_rate": 1.6133333333333335e-06,
1242
+ "loss": 0.0027,
1243
+ "step": 4275
1244
+ },
1245
+ {
1246
+ "epoch": 6.3706449221645665,
1247
+ "grad_norm": 0.06204914301633835,
1248
+ "learning_rate": 1.5577777777777777e-06,
1249
+ "loss": 0.0012,
1250
+ "step": 4300
1251
+ },
1252
+ {
1253
+ "epoch": 6.407709414381023,
1254
+ "grad_norm": 0.0820712074637413,
1255
+ "learning_rate": 1.5022222222222224e-06,
1256
+ "loss": 0.0012,
1257
+ "step": 4325
1258
+ },
1259
+ {
1260
+ "epoch": 6.444773906597479,
1261
+ "grad_norm": 0.056523606181144714,
1262
+ "learning_rate": 1.4466666666666669e-06,
1263
+ "loss": 0.0013,
1264
+ "step": 4350
1265
+ },
1266
+ {
1267
+ "epoch": 6.481838398813936,
1268
+ "grad_norm": 0.07985592633485794,
1269
+ "learning_rate": 1.3911111111111111e-06,
1270
+ "loss": 0.0014,
1271
+ "step": 4375
1272
+ },
1273
+ {
1274
+ "epoch": 6.518902891030393,
1275
+ "grad_norm": 0.044111426919698715,
1276
+ "learning_rate": 1.3355555555555558e-06,
1277
+ "loss": 0.0012,
1278
+ "step": 4400
1279
+ },
1280
+ {
1281
+ "epoch": 6.555967383246849,
1282
+ "grad_norm": 0.05683915689587593,
1283
+ "learning_rate": 1.28e-06,
1284
+ "loss": 0.0014,
1285
+ "step": 4425
1286
+ },
1287
+ {
1288
+ "epoch": 6.593031875463306,
1289
+ "grad_norm": 0.08568093180656433,
1290
+ "learning_rate": 1.2244444444444445e-06,
1291
+ "loss": 0.0012,
1292
+ "step": 4450
1293
+ },
1294
+ {
1295
+ "epoch": 6.630096367679763,
1296
+ "grad_norm": 0.054062824696302414,
1297
+ "learning_rate": 1.168888888888889e-06,
1298
+ "loss": 0.0011,
1299
+ "step": 4475
1300
+ },
1301
+ {
1302
+ "epoch": 6.667160859896219,
1303
+ "grad_norm": 0.0509476363658905,
1304
+ "learning_rate": 1.1133333333333334e-06,
1305
+ "loss": 0.0013,
1306
+ "step": 4500
1307
+ },
1308
+ {
1309
+ "epoch": 6.704225352112676,
1310
+ "grad_norm": 0.04927874356508255,
1311
+ "learning_rate": 1.0577777777777779e-06,
1312
+ "loss": 0.0012,
1313
+ "step": 4525
1314
+ },
1315
+ {
1316
+ "epoch": 6.741289844329133,
1317
+ "grad_norm": 0.08598697185516357,
1318
+ "learning_rate": 1.0022222222222223e-06,
1319
+ "loss": 0.0011,
1320
+ "step": 4550
1321
+ },
1322
+ {
1323
+ "epoch": 6.778354336545589,
1324
+ "grad_norm": 0.3571934700012207,
1325
+ "learning_rate": 9.466666666666667e-07,
1326
+ "loss": 0.0016,
1327
+ "step": 4575
1328
+ },
1329
+ {
1330
+ "epoch": 6.815418828762046,
1331
+ "grad_norm": 0.05977300554513931,
1332
+ "learning_rate": 8.911111111111112e-07,
1333
+ "loss": 0.001,
1334
+ "step": 4600
1335
+ },
1336
+ {
1337
+ "epoch": 6.852483320978503,
1338
+ "grad_norm": 0.05966237559914589,
1339
+ "learning_rate": 8.355555555555556e-07,
1340
+ "loss": 0.001,
1341
+ "step": 4625
1342
+ },
1343
+ {
1344
+ "epoch": 6.889547813194959,
1345
+ "grad_norm": 0.05432112514972687,
1346
+ "learning_rate": 7.8e-07,
1347
+ "loss": 0.001,
1348
+ "step": 4650
1349
+ },
1350
+ {
1351
+ "epoch": 6.926612305411416,
1352
+ "grad_norm": 0.06741122156381607,
1353
+ "learning_rate": 7.244444444444446e-07,
1354
+ "loss": 0.0019,
1355
+ "step": 4675
1356
+ },
1357
+ {
1358
+ "epoch": 6.963676797627873,
1359
+ "grad_norm": 0.04723643884062767,
1360
+ "learning_rate": 6.68888888888889e-07,
1361
+ "loss": 0.0012,
1362
+ "step": 4700
1363
+ },
1364
+ {
1365
+ "epoch": 7.0,
1366
+ "grad_norm": 0.07329325377941132,
1367
+ "learning_rate": 6.133333333333333e-07,
1368
+ "loss": 0.001,
1369
+ "step": 4725
1370
+ },
1371
+ {
1372
+ "epoch": 7.037064492216457,
1373
+ "grad_norm": 0.06389188766479492,
1374
+ "learning_rate": 5.577777777777779e-07,
1375
+ "loss": 0.001,
1376
+ "step": 4750
1377
+ },
1378
+ {
1379
+ "epoch": 7.074128984432913,
1380
+ "grad_norm": 0.03797365352511406,
1381
+ "learning_rate": 5.022222222222222e-07,
1382
+ "loss": 0.001,
1383
+ "step": 4775
1384
+ },
1385
+ {
1386
+ "epoch": 7.11119347664937,
1387
+ "grad_norm": 0.04686768725514412,
1388
+ "learning_rate": 4.466666666666667e-07,
1389
+ "loss": 0.0009,
1390
+ "step": 4800
1391
+ },
1392
+ {
1393
+ "epoch": 7.148257968865827,
1394
+ "grad_norm": 0.06883518397808075,
1395
+ "learning_rate": 3.9111111111111115e-07,
1396
+ "loss": 0.001,
1397
+ "step": 4825
1398
+ },
1399
+ {
1400
+ "epoch": 7.185322461082283,
1401
+ "grad_norm": 0.02842629700899124,
1402
+ "learning_rate": 3.3555555555555556e-07,
1403
+ "loss": 0.0009,
1404
+ "step": 4850
1405
+ },
1406
+ {
1407
+ "epoch": 7.22238695329874,
1408
+ "grad_norm": 0.04749394953250885,
1409
+ "learning_rate": 2.8e-07,
1410
+ "loss": 0.001,
1411
+ "step": 4875
1412
+ },
1413
+ {
1414
+ "epoch": 7.259451445515197,
1415
+ "grad_norm": 0.04491546377539635,
1416
+ "learning_rate": 2.2444444444444445e-07,
1417
+ "loss": 0.001,
1418
+ "step": 4900
1419
+ },
1420
+ {
1421
+ "epoch": 7.2965159377316535,
1422
+ "grad_norm": 0.056013334542512894,
1423
+ "learning_rate": 1.6888888888888888e-07,
1424
+ "loss": 0.001,
1425
+ "step": 4925
1426
+ },
1427
+ {
1428
+ "epoch": 7.3335804299481095,
1429
+ "grad_norm": 0.057778194546699524,
1430
+ "learning_rate": 1.1333333333333336e-07,
1431
+ "loss": 0.0011,
1432
+ "step": 4950
1433
+ },
1434
+ {
1435
+ "epoch": 7.3706449221645665,
1436
+ "grad_norm": 0.051241885870695114,
1437
+ "learning_rate": 5.777777777777778e-08,
1438
+ "loss": 0.0011,
1439
+ "step": 4975
1440
+ },
1441
+ {
1442
+ "epoch": 7.407709414381023,
1443
+ "grad_norm": 0.06301814317703247,
1444
+ "learning_rate": 2.2222222222222225e-09,
1445
+ "loss": 0.0009,
1446
+ "step": 5000
1447
+ },
1448
+ {
1449
+ "epoch": 7.407709414381023,
1450
+ "eval_loss": 0.29011788964271545,
1451
+ "eval_runtime": 732.4342,
1452
+ "eval_samples_per_second": 3.939,
1453
+ "eval_steps_per_second": 0.493,
1454
+ "eval_wer": 0.1146545827633379,
1455
+ "step": 5000
1456
+ },
1457
+ {
1458
+ "epoch": 7.407709414381023,
1459
+ "step": 5000,
1460
+ "total_flos": 8.155551755501568e+19,
1461
+ "train_loss": 0.10907779041565954,
1462
+ "train_runtime": 12394.4337,
1463
+ "train_samples_per_second": 6.455,
1464
+ "train_steps_per_second": 0.403
1465
  }
1466
  ],
1467
  "logging_steps": 25,
1468
+ "max_steps": 5000,
1469
  "num_input_tokens_seen": 0,
1470
+ "num_train_epochs": 8,
1471
  "save_steps": 1000,
1472
  "stateful_callbacks": {
1473
  "TrainerControl": {
 
1481
  "attributes": {}
1482
  }
1483
  },
1484
+ "total_flos": 8.155551755501568e+19,
1485
+ "train_batch_size": 8,
1486
  "trial_name": null,
1487
  "trial_params": null
1488
  }