EYEDOL commited on
Commit
62af3f6
·
verified ·
1 Parent(s): 4dd487d

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. checkpoint-1000/config.json +47 -0
  2. checkpoint-1000/generation_config.json +175 -0
  3. checkpoint-1000/model.safetensors +3 -0
  4. checkpoint-1000/optimizer.pt +3 -0
  5. checkpoint-1000/preprocessor_config.json +15 -0
  6. checkpoint-1000/rng_state.pth +3 -0
  7. checkpoint-1000/scaler.pt +3 -0
  8. checkpoint-1000/scheduler.pt +3 -0
  9. checkpoint-1000/trainer_state.json +429 -0
  10. checkpoint-1000/training_args.bin +3 -0
  11. checkpoint-1200/config.json +47 -0
  12. checkpoint-1200/generation_config.json +175 -0
  13. checkpoint-1200/model.safetensors +3 -0
  14. checkpoint-1200/optimizer.pt +3 -0
  15. checkpoint-1200/preprocessor_config.json +15 -0
  16. checkpoint-1200/rng_state.pth +3 -0
  17. checkpoint-1200/scaler.pt +3 -0
  18. checkpoint-1200/scheduler.pt +3 -0
  19. checkpoint-1200/trainer_state.json +508 -0
  20. checkpoint-1200/training_args.bin +3 -0
  21. checkpoint-1400/config.json +47 -0
  22. checkpoint-1400/generation_config.json +175 -0
  23. checkpoint-1400/model.safetensors +3 -0
  24. checkpoint-1400/optimizer.pt +3 -0
  25. checkpoint-1400/preprocessor_config.json +15 -0
  26. checkpoint-1400/rng_state.pth +3 -0
  27. checkpoint-1400/scaler.pt +3 -0
  28. checkpoint-1400/scheduler.pt +3 -0
  29. checkpoint-1400/trainer_state.json +587 -0
  30. checkpoint-1400/training_args.bin +3 -0
  31. checkpoint-1600/config.json +47 -0
  32. checkpoint-1600/generation_config.json +175 -0
  33. checkpoint-1600/model.safetensors +3 -0
  34. checkpoint-1600/optimizer.pt +3 -0
  35. checkpoint-1600/preprocessor_config.json +15 -0
  36. checkpoint-1600/rng_state.pth +3 -0
  37. checkpoint-1600/scaler.pt +3 -0
  38. checkpoint-1600/scheduler.pt +3 -0
  39. checkpoint-1600/trainer_state.json +666 -0
  40. checkpoint-1600/training_args.bin +3 -0
  41. checkpoint-1800/config.json +47 -0
  42. checkpoint-1800/generation_config.json +175 -0
  43. checkpoint-1800/model.safetensors +3 -0
  44. checkpoint-1800/optimizer.pt +3 -0
  45. checkpoint-1800/preprocessor_config.json +15 -0
  46. checkpoint-1800/rng_state.pth +3 -0
  47. checkpoint-1800/scaler.pt +3 -0
  48. checkpoint-1800/scheduler.pt +3 -0
  49. checkpoint-1800/trainer_state.json +745 -0
  50. checkpoint-1800/training_args.bin +3 -0
checkpoint-1000/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "gelu",
4
+ "apply_spec_augment": false,
5
+ "architectures": [
6
+ "WhisperForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "begin_suppress_tokens": null,
10
+ "bos_token_id": 50257,
11
+ "classifier_proj_size": 256,
12
+ "d_model": 768,
13
+ "decoder_attention_heads": 12,
14
+ "decoder_ffn_dim": 3072,
15
+ "decoder_layerdrop": 0.0,
16
+ "decoder_layers": 12,
17
+ "decoder_start_token_id": 50258,
18
+ "dropout": 0.0,
19
+ "encoder_attention_heads": 12,
20
+ "encoder_ffn_dim": 3072,
21
+ "encoder_layerdrop": 0.0,
22
+ "encoder_layers": 12,
23
+ "eos_token_id": 50257,
24
+ "forced_decoder_ids": null,
25
+ "init_std": 0.02,
26
+ "is_encoder_decoder": true,
27
+ "mask_feature_length": 10,
28
+ "mask_feature_min_masks": 0,
29
+ "mask_feature_prob": 0.0,
30
+ "mask_time_length": 10,
31
+ "mask_time_min_masks": 2,
32
+ "mask_time_prob": 0.05,
33
+ "max_length": null,
34
+ "max_source_positions": 1500,
35
+ "max_target_positions": 448,
36
+ "median_filter_width": 7,
37
+ "model_type": "whisper",
38
+ "num_hidden_layers": 12,
39
+ "num_mel_bins": 80,
40
+ "pad_token_id": 50257,
41
+ "scale_embedding": false,
42
+ "torch_dtype": "float32",
43
+ "transformers_version": "4.51.3",
44
+ "use_cache": false,
45
+ "use_weighted_layer_sum": false,
46
+ "vocab_size": 51865
47
+ }
checkpoint-1000/generation_config.json ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alignment_heads": [
3
+ [
4
+ 5,
5
+ 3
6
+ ],
7
+ [
8
+ 5,
9
+ 9
10
+ ],
11
+ [
12
+ 8,
13
+ 0
14
+ ],
15
+ [
16
+ 8,
17
+ 4
18
+ ],
19
+ [
20
+ 8,
21
+ 7
22
+ ],
23
+ [
24
+ 8,
25
+ 8
26
+ ],
27
+ [
28
+ 9,
29
+ 0
30
+ ],
31
+ [
32
+ 9,
33
+ 7
34
+ ],
35
+ [
36
+ 9,
37
+ 9
38
+ ],
39
+ [
40
+ 10,
41
+ 5
42
+ ]
43
+ ],
44
+ "begin_suppress_tokens": [
45
+ 220,
46
+ 50257
47
+ ],
48
+ "bos_token_id": 50257,
49
+ "decoder_start_token_id": 50258,
50
+ "eos_token_id": 50257,
51
+ "forced_decoder_ids": [
52
+ [
53
+ 1,
54
+ null
55
+ ],
56
+ [
57
+ 2,
58
+ 50359
59
+ ]
60
+ ],
61
+ "is_multilingual": true,
62
+ "lang_to_id": {
63
+ "<|af|>": 50327,
64
+ "<|am|>": 50334,
65
+ "<|ar|>": 50272,
66
+ "<|as|>": 50350,
67
+ "<|az|>": 50304,
68
+ "<|ba|>": 50355,
69
+ "<|be|>": 50330,
70
+ "<|bg|>": 50292,
71
+ "<|bn|>": 50302,
72
+ "<|bo|>": 50347,
73
+ "<|br|>": 50309,
74
+ "<|bs|>": 50315,
75
+ "<|ca|>": 50270,
76
+ "<|cs|>": 50283,
77
+ "<|cy|>": 50297,
78
+ "<|da|>": 50285,
79
+ "<|de|>": 50261,
80
+ "<|el|>": 50281,
81
+ "<|en|>": 50259,
82
+ "<|es|>": 50262,
83
+ "<|et|>": 50307,
84
+ "<|eu|>": 50310,
85
+ "<|fa|>": 50300,
86
+ "<|fi|>": 50277,
87
+ "<|fo|>": 50338,
88
+ "<|fr|>": 50265,
89
+ "<|gl|>": 50319,
90
+ "<|gu|>": 50333,
91
+ "<|haw|>": 50352,
92
+ "<|ha|>": 50354,
93
+ "<|he|>": 50279,
94
+ "<|hi|>": 50276,
95
+ "<|hr|>": 50291,
96
+ "<|ht|>": 50339,
97
+ "<|hu|>": 50286,
98
+ "<|hy|>": 50312,
99
+ "<|id|>": 50275,
100
+ "<|is|>": 50311,
101
+ "<|it|>": 50274,
102
+ "<|ja|>": 50266,
103
+ "<|jw|>": 50356,
104
+ "<|ka|>": 50329,
105
+ "<|kk|>": 50316,
106
+ "<|km|>": 50323,
107
+ "<|kn|>": 50306,
108
+ "<|ko|>": 50264,
109
+ "<|la|>": 50294,
110
+ "<|lb|>": 50345,
111
+ "<|ln|>": 50353,
112
+ "<|lo|>": 50336,
113
+ "<|lt|>": 50293,
114
+ "<|lv|>": 50301,
115
+ "<|mg|>": 50349,
116
+ "<|mi|>": 50295,
117
+ "<|mk|>": 50308,
118
+ "<|ml|>": 50296,
119
+ "<|mn|>": 50314,
120
+ "<|mr|>": 50320,
121
+ "<|ms|>": 50282,
122
+ "<|mt|>": 50343,
123
+ "<|my|>": 50346,
124
+ "<|ne|>": 50313,
125
+ "<|nl|>": 50271,
126
+ "<|nn|>": 50342,
127
+ "<|no|>": 50288,
128
+ "<|oc|>": 50328,
129
+ "<|pa|>": 50321,
130
+ "<|pl|>": 50269,
131
+ "<|ps|>": 50340,
132
+ "<|pt|>": 50267,
133
+ "<|ro|>": 50284,
134
+ "<|ru|>": 50263,
135
+ "<|sa|>": 50344,
136
+ "<|sd|>": 50332,
137
+ "<|si|>": 50322,
138
+ "<|sk|>": 50298,
139
+ "<|sl|>": 50305,
140
+ "<|sn|>": 50324,
141
+ "<|so|>": 50326,
142
+ "<|sq|>": 50317,
143
+ "<|sr|>": 50303,
144
+ "<|su|>": 50357,
145
+ "<|sv|>": 50273,
146
+ "<|sw|>": 50318,
147
+ "<|ta|>": 50287,
148
+ "<|te|>": 50299,
149
+ "<|tg|>": 50331,
150
+ "<|th|>": 50289,
151
+ "<|tk|>": 50341,
152
+ "<|tl|>": 50348,
153
+ "<|tr|>": 50268,
154
+ "<|tt|>": 50351,
155
+ "<|uk|>": 50280,
156
+ "<|ur|>": 50290,
157
+ "<|uz|>": 50337,
158
+ "<|vi|>": 50278,
159
+ "<|yi|>": 50335,
160
+ "<|yo|>": 50325,
161
+ "<|zh|>": 50260
162
+ },
163
+ "max_initial_timestamp_index": 50,
164
+ "max_length": 448,
165
+ "no_timestamps_token_id": 50363,
166
+ "pad_token_id": 50257,
167
+ "prev_sot_token_id": 50361,
168
+ "return_timestamps": false,
169
+ "suppress_tokens": [],
170
+ "task_to_id": {
171
+ "transcribe": 50359,
172
+ "translate": 50358
173
+ },
174
+ "transformers_version": "4.51.3"
175
+ }
checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:516965ba5e615b23964c74658dc69b0b20127cc5e33a7e0f0f60b45805c48dbe
3
+ size 966995080
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a8ffcdd2d3002c7325f6bda25881965d45a68acda6014188e5992798d2bedec
3
+ size 1925064044
checkpoint-1000/preprocessor_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 30,
3
+ "dither": 0.0,
4
+ "feature_extractor_type": "WhisperFeatureExtractor",
5
+ "feature_size": 80,
6
+ "hop_length": 160,
7
+ "n_fft": 400,
8
+ "n_samples": 480000,
9
+ "nb_max_frames": 3000,
10
+ "padding_side": "right",
11
+ "padding_value": 0.0,
12
+ "processor_class": "WhisperProcessor",
13
+ "return_attention_mask": false,
14
+ "sampling_rate": 16000
15
+ }
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:834c781436b196ed932372aaa6ec6dbfb0ebf83a994d376adbdcf25c4778b05e
3
+ size 14244
checkpoint-1000/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a694eb2d6a253a859eb936acdef0e4e6559cd78dd4e08463d9a6ca4f6f44b832
3
+ size 988
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76adde150b7edfb3ae13f98b035fa2e0eea67ff8b9245b007f253d93a74b937c
3
+ size 1064
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,429 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1000,
3
+ "best_metric": 38.00771402098731,
4
+ "best_model_checkpoint": "./HAUSA_B/checkpoint-1000",
5
+ "epoch": 1.3534190927555856,
6
+ "eval_steps": 200,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.027081922816519974,
14
+ "grad_norm": 106.59215545654297,
15
+ "learning_rate": 8.000000000000001e-07,
16
+ "loss": 6.1915,
17
+ "step": 20
18
+ },
19
+ {
20
+ "epoch": 0.05416384563303995,
21
+ "grad_norm": 30.563518524169922,
22
+ "learning_rate": 1.8000000000000001e-06,
23
+ "loss": 4.5886,
24
+ "step": 40
25
+ },
26
+ {
27
+ "epoch": 0.08124576844955991,
28
+ "grad_norm": 16.528457641601562,
29
+ "learning_rate": 2.8000000000000003e-06,
30
+ "loss": 3.0338,
31
+ "step": 60
32
+ },
33
+ {
34
+ "epoch": 0.1083276912660799,
35
+ "grad_norm": 13.852096557617188,
36
+ "learning_rate": 3.8000000000000005e-06,
37
+ "loss": 2.3188,
38
+ "step": 80
39
+ },
40
+ {
41
+ "epoch": 0.13540961408259986,
42
+ "grad_norm": 13.284646987915039,
43
+ "learning_rate": 4.800000000000001e-06,
44
+ "loss": 1.9725,
45
+ "step": 100
46
+ },
47
+ {
48
+ "epoch": 0.16249153689911983,
49
+ "grad_norm": 13.212055206298828,
50
+ "learning_rate": 5.8e-06,
51
+ "loss": 1.7839,
52
+ "step": 120
53
+ },
54
+ {
55
+ "epoch": 0.1895734597156398,
56
+ "grad_norm": 12.006990432739258,
57
+ "learning_rate": 6.800000000000001e-06,
58
+ "loss": 1.5991,
59
+ "step": 140
60
+ },
61
+ {
62
+ "epoch": 0.2166553825321598,
63
+ "grad_norm": 12.490514755249023,
64
+ "learning_rate": 7.800000000000002e-06,
65
+ "loss": 1.4411,
66
+ "step": 160
67
+ },
68
+ {
69
+ "epoch": 0.24373730534867977,
70
+ "grad_norm": 11.587646484375,
71
+ "learning_rate": 8.8e-06,
72
+ "loss": 1.3487,
73
+ "step": 180
74
+ },
75
+ {
76
+ "epoch": 0.2708192281651997,
77
+ "grad_norm": 11.574933052062988,
78
+ "learning_rate": 9.800000000000001e-06,
79
+ "loss": 1.2119,
80
+ "step": 200
81
+ },
82
+ {
83
+ "epoch": 0.2708192281651997,
84
+ "eval_loss": 1.1157827377319336,
85
+ "eval_runtime": 1601.2991,
86
+ "eval_samples_per_second": 3.688,
87
+ "eval_steps_per_second": 0.462,
88
+ "eval_wer": 64.22461160550755,
89
+ "step": 200
90
+ },
91
+ {
92
+ "epoch": 0.2979011509817197,
93
+ "grad_norm": 8.368565559387207,
94
+ "learning_rate": 9.920556107249256e-06,
95
+ "loss": 0.9452,
96
+ "step": 220
97
+ },
98
+ {
99
+ "epoch": 0.32498307379823965,
100
+ "grad_norm": 7.2616682052612305,
101
+ "learning_rate": 9.821251241310825e-06,
102
+ "loss": 0.9298,
103
+ "step": 240
104
+ },
105
+ {
106
+ "epoch": 0.35206499661475965,
107
+ "grad_norm": 7.266394138336182,
108
+ "learning_rate": 9.721946375372395e-06,
109
+ "loss": 0.8567,
110
+ "step": 260
111
+ },
112
+ {
113
+ "epoch": 0.3791469194312796,
114
+ "grad_norm": 8.120067596435547,
115
+ "learning_rate": 9.622641509433963e-06,
116
+ "loss": 0.8251,
117
+ "step": 280
118
+ },
119
+ {
120
+ "epoch": 0.4062288422477996,
121
+ "grad_norm": 7.483547687530518,
122
+ "learning_rate": 9.523336643495532e-06,
123
+ "loss": 0.7774,
124
+ "step": 300
125
+ },
126
+ {
127
+ "epoch": 0.4333107650643196,
128
+ "grad_norm": 7.521896839141846,
129
+ "learning_rate": 9.4240317775571e-06,
130
+ "loss": 0.7266,
131
+ "step": 320
132
+ },
133
+ {
134
+ "epoch": 0.46039268788083954,
135
+ "grad_norm": 7.073266506195068,
136
+ "learning_rate": 9.32472691161867e-06,
137
+ "loss": 0.7248,
138
+ "step": 340
139
+ },
140
+ {
141
+ "epoch": 0.48747461069735953,
142
+ "grad_norm": 6.335423469543457,
143
+ "learning_rate": 9.22542204568024e-06,
144
+ "loss": 0.7151,
145
+ "step": 360
146
+ },
147
+ {
148
+ "epoch": 0.5145565335138795,
149
+ "grad_norm": 6.936922550201416,
150
+ "learning_rate": 9.126117179741808e-06,
151
+ "loss": 0.6881,
152
+ "step": 380
153
+ },
154
+ {
155
+ "epoch": 0.5416384563303994,
156
+ "grad_norm": 7.596807479858398,
157
+ "learning_rate": 9.026812313803377e-06,
158
+ "loss": 0.6995,
159
+ "step": 400
160
+ },
161
+ {
162
+ "epoch": 0.5416384563303994,
163
+ "eval_loss": 0.6352065801620483,
164
+ "eval_runtime": 1597.3635,
165
+ "eval_samples_per_second": 3.697,
166
+ "eval_steps_per_second": 0.463,
167
+ "eval_wer": 51.63653601672089,
168
+ "step": 400
169
+ },
170
+ {
171
+ "epoch": 0.5687203791469194,
172
+ "grad_norm": 6.981812477111816,
173
+ "learning_rate": 8.927507447864945e-06,
174
+ "loss": 0.6795,
175
+ "step": 420
176
+ },
177
+ {
178
+ "epoch": 0.5958023019634394,
179
+ "grad_norm": 6.481506824493408,
180
+ "learning_rate": 8.828202581926516e-06,
181
+ "loss": 0.6643,
182
+ "step": 440
183
+ },
184
+ {
185
+ "epoch": 0.6228842247799594,
186
+ "grad_norm": 6.537086009979248,
187
+ "learning_rate": 8.728897715988084e-06,
188
+ "loss": 0.6459,
189
+ "step": 460
190
+ },
191
+ {
192
+ "epoch": 0.6499661475964793,
193
+ "grad_norm": 6.739567756652832,
194
+ "learning_rate": 8.629592850049653e-06,
195
+ "loss": 0.6736,
196
+ "step": 480
197
+ },
198
+ {
199
+ "epoch": 0.6770480704129993,
200
+ "grad_norm": 7.422546863555908,
201
+ "learning_rate": 8.530287984111221e-06,
202
+ "loss": 0.6591,
203
+ "step": 500
204
+ },
205
+ {
206
+ "epoch": 0.7041299932295193,
207
+ "grad_norm": 5.7051215171813965,
208
+ "learning_rate": 8.430983118172792e-06,
209
+ "loss": 0.6442,
210
+ "step": 520
211
+ },
212
+ {
213
+ "epoch": 0.7312119160460393,
214
+ "grad_norm": 7.166143417358398,
215
+ "learning_rate": 8.33167825223436e-06,
216
+ "loss": 0.6326,
217
+ "step": 540
218
+ },
219
+ {
220
+ "epoch": 0.7582938388625592,
221
+ "grad_norm": 7.759460926055908,
222
+ "learning_rate": 8.232373386295929e-06,
223
+ "loss": 0.6328,
224
+ "step": 560
225
+ },
226
+ {
227
+ "epoch": 0.7853757616790792,
228
+ "grad_norm": 5.876537799835205,
229
+ "learning_rate": 8.133068520357497e-06,
230
+ "loss": 0.5945,
231
+ "step": 580
232
+ },
233
+ {
234
+ "epoch": 0.8124576844955992,
235
+ "grad_norm": 6.475106716156006,
236
+ "learning_rate": 8.033763654419066e-06,
237
+ "loss": 0.6038,
238
+ "step": 600
239
+ },
240
+ {
241
+ "epoch": 0.8124576844955992,
242
+ "eval_loss": 0.5318673849105835,
243
+ "eval_runtime": 1614.0202,
244
+ "eval_samples_per_second": 3.659,
245
+ "eval_steps_per_second": 0.458,
246
+ "eval_wer": 44.61419121291129,
247
+ "step": 600
248
+ },
249
+ {
250
+ "epoch": 0.8395396073121192,
251
+ "grad_norm": 6.8348259925842285,
252
+ "learning_rate": 7.934458788480636e-06,
253
+ "loss": 0.6046,
254
+ "step": 620
255
+ },
256
+ {
257
+ "epoch": 0.8666215301286392,
258
+ "grad_norm": 6.060784339904785,
259
+ "learning_rate": 7.835153922542206e-06,
260
+ "loss": 0.6055,
261
+ "step": 640
262
+ },
263
+ {
264
+ "epoch": 0.8937034529451591,
265
+ "grad_norm": 6.01262903213501,
266
+ "learning_rate": 7.735849056603775e-06,
267
+ "loss": 0.6139,
268
+ "step": 660
269
+ },
270
+ {
271
+ "epoch": 0.9207853757616791,
272
+ "grad_norm": 6.706854820251465,
273
+ "learning_rate": 7.636544190665344e-06,
274
+ "loss": 0.5789,
275
+ "step": 680
276
+ },
277
+ {
278
+ "epoch": 0.9478672985781991,
279
+ "grad_norm": 6.464681625366211,
280
+ "learning_rate": 7.537239324726913e-06,
281
+ "loss": 0.5857,
282
+ "step": 700
283
+ },
284
+ {
285
+ "epoch": 0.9749492213947191,
286
+ "grad_norm": 7.8035478591918945,
287
+ "learning_rate": 7.437934458788482e-06,
288
+ "loss": 0.5739,
289
+ "step": 720
290
+ },
291
+ {
292
+ "epoch": 1.001354096140826,
293
+ "grad_norm": 5.574532985687256,
294
+ "learning_rate": 7.33862959285005e-06,
295
+ "loss": 0.5443,
296
+ "step": 740
297
+ },
298
+ {
299
+ "epoch": 1.028436018957346,
300
+ "grad_norm": 5.275153160095215,
301
+ "learning_rate": 7.23932472691162e-06,
302
+ "loss": 0.4705,
303
+ "step": 760
304
+ },
305
+ {
306
+ "epoch": 1.055517941773866,
307
+ "grad_norm": 5.722527027130127,
308
+ "learning_rate": 7.140019860973188e-06,
309
+ "loss": 0.4692,
310
+ "step": 780
311
+ },
312
+ {
313
+ "epoch": 1.0825998645903858,
314
+ "grad_norm": 5.501762866973877,
315
+ "learning_rate": 7.040714995034758e-06,
316
+ "loss": 0.4791,
317
+ "step": 800
318
+ },
319
+ {
320
+ "epoch": 1.0825998645903858,
321
+ "eval_loss": 0.46895861625671387,
322
+ "eval_runtime": 1599.6376,
323
+ "eval_samples_per_second": 3.691,
324
+ "eval_steps_per_second": 0.462,
325
+ "eval_wer": 40.9726561658299,
326
+ "step": 800
327
+ },
328
+ {
329
+ "epoch": 1.1096817874069058,
330
+ "grad_norm": 5.207645893096924,
331
+ "learning_rate": 6.941410129096326e-06,
332
+ "loss": 0.4569,
333
+ "step": 820
334
+ },
335
+ {
336
+ "epoch": 1.1367637102234258,
337
+ "grad_norm": 5.584475517272949,
338
+ "learning_rate": 6.842105263157896e-06,
339
+ "loss": 0.4262,
340
+ "step": 840
341
+ },
342
+ {
343
+ "epoch": 1.1638456330399458,
344
+ "grad_norm": 5.881894588470459,
345
+ "learning_rate": 6.742800397219464e-06,
346
+ "loss": 0.4533,
347
+ "step": 860
348
+ },
349
+ {
350
+ "epoch": 1.1909275558564658,
351
+ "grad_norm": 6.939334869384766,
352
+ "learning_rate": 6.643495531281034e-06,
353
+ "loss": 0.4675,
354
+ "step": 880
355
+ },
356
+ {
357
+ "epoch": 1.2180094786729858,
358
+ "grad_norm": 5.780360698699951,
359
+ "learning_rate": 6.544190665342602e-06,
360
+ "loss": 0.4238,
361
+ "step": 900
362
+ },
363
+ {
364
+ "epoch": 1.2450914014895058,
365
+ "grad_norm": 5.867160797119141,
366
+ "learning_rate": 6.444885799404172e-06,
367
+ "loss": 0.444,
368
+ "step": 920
369
+ },
370
+ {
371
+ "epoch": 1.2721733243060258,
372
+ "grad_norm": 6.121824741363525,
373
+ "learning_rate": 6.34558093346574e-06,
374
+ "loss": 0.4333,
375
+ "step": 940
376
+ },
377
+ {
378
+ "epoch": 1.2992552471225456,
379
+ "grad_norm": 5.132157802581787,
380
+ "learning_rate": 6.24627606752731e-06,
381
+ "loss": 0.4544,
382
+ "step": 960
383
+ },
384
+ {
385
+ "epoch": 1.3263371699390656,
386
+ "grad_norm": 6.384315013885498,
387
+ "learning_rate": 6.146971201588878e-06,
388
+ "loss": 0.4332,
389
+ "step": 980
390
+ },
391
+ {
392
+ "epoch": 1.3534190927555856,
393
+ "grad_norm": 4.54695987701416,
394
+ "learning_rate": 6.047666335650447e-06,
395
+ "loss": 0.4416,
396
+ "step": 1000
397
+ },
398
+ {
399
+ "epoch": 1.3534190927555856,
400
+ "eval_loss": 0.43043428659439087,
401
+ "eval_runtime": 1614.3899,
402
+ "eval_samples_per_second": 3.658,
403
+ "eval_steps_per_second": 0.458,
404
+ "eval_wer": 38.00771402098731,
405
+ "step": 1000
406
+ }
407
+ ],
408
+ "logging_steps": 20,
409
+ "max_steps": 2214,
410
+ "num_input_tokens_seen": 0,
411
+ "num_train_epochs": 3,
412
+ "save_steps": 200,
413
+ "stateful_callbacks": {
414
+ "TrainerControl": {
415
+ "args": {
416
+ "should_epoch_stop": false,
417
+ "should_evaluate": false,
418
+ "should_log": false,
419
+ "should_save": true,
420
+ "should_training_stop": false
421
+ },
422
+ "attributes": {}
423
+ }
424
+ },
425
+ "total_flos": 9.22665241903104e+18,
426
+ "train_batch_size": 16,
427
+ "trial_name": null,
428
+ "trial_params": null
429
+ }
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d182356a12cee606170360f976946e1a56ce9d494fc1de269ffa184a80a19aa
3
+ size 5432
checkpoint-1200/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "gelu",
4
+ "apply_spec_augment": false,
5
+ "architectures": [
6
+ "WhisperForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "begin_suppress_tokens": null,
10
+ "bos_token_id": 50257,
11
+ "classifier_proj_size": 256,
12
+ "d_model": 768,
13
+ "decoder_attention_heads": 12,
14
+ "decoder_ffn_dim": 3072,
15
+ "decoder_layerdrop": 0.0,
16
+ "decoder_layers": 12,
17
+ "decoder_start_token_id": 50258,
18
+ "dropout": 0.0,
19
+ "encoder_attention_heads": 12,
20
+ "encoder_ffn_dim": 3072,
21
+ "encoder_layerdrop": 0.0,
22
+ "encoder_layers": 12,
23
+ "eos_token_id": 50257,
24
+ "forced_decoder_ids": null,
25
+ "init_std": 0.02,
26
+ "is_encoder_decoder": true,
27
+ "mask_feature_length": 10,
28
+ "mask_feature_min_masks": 0,
29
+ "mask_feature_prob": 0.0,
30
+ "mask_time_length": 10,
31
+ "mask_time_min_masks": 2,
32
+ "mask_time_prob": 0.05,
33
+ "max_length": null,
34
+ "max_source_positions": 1500,
35
+ "max_target_positions": 448,
36
+ "median_filter_width": 7,
37
+ "model_type": "whisper",
38
+ "num_hidden_layers": 12,
39
+ "num_mel_bins": 80,
40
+ "pad_token_id": 50257,
41
+ "scale_embedding": false,
42
+ "torch_dtype": "float32",
43
+ "transformers_version": "4.51.3",
44
+ "use_cache": false,
45
+ "use_weighted_layer_sum": false,
46
+ "vocab_size": 51865
47
+ }
checkpoint-1200/generation_config.json ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alignment_heads": [
3
+ [
4
+ 5,
5
+ 3
6
+ ],
7
+ [
8
+ 5,
9
+ 9
10
+ ],
11
+ [
12
+ 8,
13
+ 0
14
+ ],
15
+ [
16
+ 8,
17
+ 4
18
+ ],
19
+ [
20
+ 8,
21
+ 7
22
+ ],
23
+ [
24
+ 8,
25
+ 8
26
+ ],
27
+ [
28
+ 9,
29
+ 0
30
+ ],
31
+ [
32
+ 9,
33
+ 7
34
+ ],
35
+ [
36
+ 9,
37
+ 9
38
+ ],
39
+ [
40
+ 10,
41
+ 5
42
+ ]
43
+ ],
44
+ "begin_suppress_tokens": [
45
+ 220,
46
+ 50257
47
+ ],
48
+ "bos_token_id": 50257,
49
+ "decoder_start_token_id": 50258,
50
+ "eos_token_id": 50257,
51
+ "forced_decoder_ids": [
52
+ [
53
+ 1,
54
+ null
55
+ ],
56
+ [
57
+ 2,
58
+ 50359
59
+ ]
60
+ ],
61
+ "is_multilingual": true,
62
+ "lang_to_id": {
63
+ "<|af|>": 50327,
64
+ "<|am|>": 50334,
65
+ "<|ar|>": 50272,
66
+ "<|as|>": 50350,
67
+ "<|az|>": 50304,
68
+ "<|ba|>": 50355,
69
+ "<|be|>": 50330,
70
+ "<|bg|>": 50292,
71
+ "<|bn|>": 50302,
72
+ "<|bo|>": 50347,
73
+ "<|br|>": 50309,
74
+ "<|bs|>": 50315,
75
+ "<|ca|>": 50270,
76
+ "<|cs|>": 50283,
77
+ "<|cy|>": 50297,
78
+ "<|da|>": 50285,
79
+ "<|de|>": 50261,
80
+ "<|el|>": 50281,
81
+ "<|en|>": 50259,
82
+ "<|es|>": 50262,
83
+ "<|et|>": 50307,
84
+ "<|eu|>": 50310,
85
+ "<|fa|>": 50300,
86
+ "<|fi|>": 50277,
87
+ "<|fo|>": 50338,
88
+ "<|fr|>": 50265,
89
+ "<|gl|>": 50319,
90
+ "<|gu|>": 50333,
91
+ "<|haw|>": 50352,
92
+ "<|ha|>": 50354,
93
+ "<|he|>": 50279,
94
+ "<|hi|>": 50276,
95
+ "<|hr|>": 50291,
96
+ "<|ht|>": 50339,
97
+ "<|hu|>": 50286,
98
+ "<|hy|>": 50312,
99
+ "<|id|>": 50275,
100
+ "<|is|>": 50311,
101
+ "<|it|>": 50274,
102
+ "<|ja|>": 50266,
103
+ "<|jw|>": 50356,
104
+ "<|ka|>": 50329,
105
+ "<|kk|>": 50316,
106
+ "<|km|>": 50323,
107
+ "<|kn|>": 50306,
108
+ "<|ko|>": 50264,
109
+ "<|la|>": 50294,
110
+ "<|lb|>": 50345,
111
+ "<|ln|>": 50353,
112
+ "<|lo|>": 50336,
113
+ "<|lt|>": 50293,
114
+ "<|lv|>": 50301,
115
+ "<|mg|>": 50349,
116
+ "<|mi|>": 50295,
117
+ "<|mk|>": 50308,
118
+ "<|ml|>": 50296,
119
+ "<|mn|>": 50314,
120
+ "<|mr|>": 50320,
121
+ "<|ms|>": 50282,
122
+ "<|mt|>": 50343,
123
+ "<|my|>": 50346,
124
+ "<|ne|>": 50313,
125
+ "<|nl|>": 50271,
126
+ "<|nn|>": 50342,
127
+ "<|no|>": 50288,
128
+ "<|oc|>": 50328,
129
+ "<|pa|>": 50321,
130
+ "<|pl|>": 50269,
131
+ "<|ps|>": 50340,
132
+ "<|pt|>": 50267,
133
+ "<|ro|>": 50284,
134
+ "<|ru|>": 50263,
135
+ "<|sa|>": 50344,
136
+ "<|sd|>": 50332,
137
+ "<|si|>": 50322,
138
+ "<|sk|>": 50298,
139
+ "<|sl|>": 50305,
140
+ "<|sn|>": 50324,
141
+ "<|so|>": 50326,
142
+ "<|sq|>": 50317,
143
+ "<|sr|>": 50303,
144
+ "<|su|>": 50357,
145
+ "<|sv|>": 50273,
146
+ "<|sw|>": 50318,
147
+ "<|ta|>": 50287,
148
+ "<|te|>": 50299,
149
+ "<|tg|>": 50331,
150
+ "<|th|>": 50289,
151
+ "<|tk|>": 50341,
152
+ "<|tl|>": 50348,
153
+ "<|tr|>": 50268,
154
+ "<|tt|>": 50351,
155
+ "<|uk|>": 50280,
156
+ "<|ur|>": 50290,
157
+ "<|uz|>": 50337,
158
+ "<|vi|>": 50278,
159
+ "<|yi|>": 50335,
160
+ "<|yo|>": 50325,
161
+ "<|zh|>": 50260
162
+ },
163
+ "max_initial_timestamp_index": 50,
164
+ "max_length": 448,
165
+ "no_timestamps_token_id": 50363,
166
+ "pad_token_id": 50257,
167
+ "prev_sot_token_id": 50361,
168
+ "return_timestamps": false,
169
+ "suppress_tokens": [],
170
+ "task_to_id": {
171
+ "transcribe": 50359,
172
+ "translate": 50358
173
+ },
174
+ "transformers_version": "4.51.3"
175
+ }
checkpoint-1200/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb14949829a92f21b5831610a3e1c5818517a70e7b78c91869165be3fa027220
3
+ size 966995080
checkpoint-1200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73d3c742b7a9d7e3ae2729a416b557ee7782f301ef925dea88ee4c0b277e087b
3
+ size 1925064044
checkpoint-1200/preprocessor_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 30,
3
+ "dither": 0.0,
4
+ "feature_extractor_type": "WhisperFeatureExtractor",
5
+ "feature_size": 80,
6
+ "hop_length": 160,
7
+ "n_fft": 400,
8
+ "n_samples": 480000,
9
+ "nb_max_frames": 3000,
10
+ "padding_side": "right",
11
+ "padding_value": 0.0,
12
+ "processor_class": "WhisperProcessor",
13
+ "return_attention_mask": false,
14
+ "sampling_rate": 16000
15
+ }
checkpoint-1200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ee6e7cb7371fbb90ac429f9ed02af5fb766eaadcd01bf63f86ad9dfa2abf6c6
3
+ size 14244
checkpoint-1200/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2bba2d3316c5a034cb242c96fb8e8d64a45bd5676f615fdf565b22b5760dea1
3
+ size 988
checkpoint-1200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a2a9f4d0fd0f57c0ac9e3087fe25fc38ac9090191d38f259c2fb5164d10dce8
3
+ size 1064
checkpoint-1200/trainer_state.json ADDED
@@ -0,0 +1,508 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1200,
3
+ "best_metric": 37.091943373052636,
4
+ "best_model_checkpoint": "./HAUSA_B/checkpoint-1200",
5
+ "epoch": 1.6242383209207854,
6
+ "eval_steps": 200,
7
+ "global_step": 1200,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.027081922816519974,
14
+ "grad_norm": 106.59215545654297,
15
+ "learning_rate": 8.000000000000001e-07,
16
+ "loss": 6.1915,
17
+ "step": 20
18
+ },
19
+ {
20
+ "epoch": 0.05416384563303995,
21
+ "grad_norm": 30.563518524169922,
22
+ "learning_rate": 1.8000000000000001e-06,
23
+ "loss": 4.5886,
24
+ "step": 40
25
+ },
26
+ {
27
+ "epoch": 0.08124576844955991,
28
+ "grad_norm": 16.528457641601562,
29
+ "learning_rate": 2.8000000000000003e-06,
30
+ "loss": 3.0338,
31
+ "step": 60
32
+ },
33
+ {
34
+ "epoch": 0.1083276912660799,
35
+ "grad_norm": 13.852096557617188,
36
+ "learning_rate": 3.8000000000000005e-06,
37
+ "loss": 2.3188,
38
+ "step": 80
39
+ },
40
+ {
41
+ "epoch": 0.13540961408259986,
42
+ "grad_norm": 13.284646987915039,
43
+ "learning_rate": 4.800000000000001e-06,
44
+ "loss": 1.9725,
45
+ "step": 100
46
+ },
47
+ {
48
+ "epoch": 0.16249153689911983,
49
+ "grad_norm": 13.212055206298828,
50
+ "learning_rate": 5.8e-06,
51
+ "loss": 1.7839,
52
+ "step": 120
53
+ },
54
+ {
55
+ "epoch": 0.1895734597156398,
56
+ "grad_norm": 12.006990432739258,
57
+ "learning_rate": 6.800000000000001e-06,
58
+ "loss": 1.5991,
59
+ "step": 140
60
+ },
61
+ {
62
+ "epoch": 0.2166553825321598,
63
+ "grad_norm": 12.490514755249023,
64
+ "learning_rate": 7.800000000000002e-06,
65
+ "loss": 1.4411,
66
+ "step": 160
67
+ },
68
+ {
69
+ "epoch": 0.24373730534867977,
70
+ "grad_norm": 11.587646484375,
71
+ "learning_rate": 8.8e-06,
72
+ "loss": 1.3487,
73
+ "step": 180
74
+ },
75
+ {
76
+ "epoch": 0.2708192281651997,
77
+ "grad_norm": 11.574933052062988,
78
+ "learning_rate": 9.800000000000001e-06,
79
+ "loss": 1.2119,
80
+ "step": 200
81
+ },
82
+ {
83
+ "epoch": 0.2708192281651997,
84
+ "eval_loss": 1.1157827377319336,
85
+ "eval_runtime": 1601.2991,
86
+ "eval_samples_per_second": 3.688,
87
+ "eval_steps_per_second": 0.462,
88
+ "eval_wer": 64.22461160550755,
89
+ "step": 200
90
+ },
91
+ {
92
+ "epoch": 0.2979011509817197,
93
+ "grad_norm": 8.368565559387207,
94
+ "learning_rate": 9.920556107249256e-06,
95
+ "loss": 0.9452,
96
+ "step": 220
97
+ },
98
+ {
99
+ "epoch": 0.32498307379823965,
100
+ "grad_norm": 7.2616682052612305,
101
+ "learning_rate": 9.821251241310825e-06,
102
+ "loss": 0.9298,
103
+ "step": 240
104
+ },
105
+ {
106
+ "epoch": 0.35206499661475965,
107
+ "grad_norm": 7.266394138336182,
108
+ "learning_rate": 9.721946375372395e-06,
109
+ "loss": 0.8567,
110
+ "step": 260
111
+ },
112
+ {
113
+ "epoch": 0.3791469194312796,
114
+ "grad_norm": 8.120067596435547,
115
+ "learning_rate": 9.622641509433963e-06,
116
+ "loss": 0.8251,
117
+ "step": 280
118
+ },
119
+ {
120
+ "epoch": 0.4062288422477996,
121
+ "grad_norm": 7.483547687530518,
122
+ "learning_rate": 9.523336643495532e-06,
123
+ "loss": 0.7774,
124
+ "step": 300
125
+ },
126
+ {
127
+ "epoch": 0.4333107650643196,
128
+ "grad_norm": 7.521896839141846,
129
+ "learning_rate": 9.4240317775571e-06,
130
+ "loss": 0.7266,
131
+ "step": 320
132
+ },
133
+ {
134
+ "epoch": 0.46039268788083954,
135
+ "grad_norm": 7.073266506195068,
136
+ "learning_rate": 9.32472691161867e-06,
137
+ "loss": 0.7248,
138
+ "step": 340
139
+ },
140
+ {
141
+ "epoch": 0.48747461069735953,
142
+ "grad_norm": 6.335423469543457,
143
+ "learning_rate": 9.22542204568024e-06,
144
+ "loss": 0.7151,
145
+ "step": 360
146
+ },
147
+ {
148
+ "epoch": 0.5145565335138795,
149
+ "grad_norm": 6.936922550201416,
150
+ "learning_rate": 9.126117179741808e-06,
151
+ "loss": 0.6881,
152
+ "step": 380
153
+ },
154
+ {
155
+ "epoch": 0.5416384563303994,
156
+ "grad_norm": 7.596807479858398,
157
+ "learning_rate": 9.026812313803377e-06,
158
+ "loss": 0.6995,
159
+ "step": 400
160
+ },
161
+ {
162
+ "epoch": 0.5416384563303994,
163
+ "eval_loss": 0.6352065801620483,
164
+ "eval_runtime": 1597.3635,
165
+ "eval_samples_per_second": 3.697,
166
+ "eval_steps_per_second": 0.463,
167
+ "eval_wer": 51.63653601672089,
168
+ "step": 400
169
+ },
170
+ {
171
+ "epoch": 0.5687203791469194,
172
+ "grad_norm": 6.981812477111816,
173
+ "learning_rate": 8.927507447864945e-06,
174
+ "loss": 0.6795,
175
+ "step": 420
176
+ },
177
+ {
178
+ "epoch": 0.5958023019634394,
179
+ "grad_norm": 6.481506824493408,
180
+ "learning_rate": 8.828202581926516e-06,
181
+ "loss": 0.6643,
182
+ "step": 440
183
+ },
184
+ {
185
+ "epoch": 0.6228842247799594,
186
+ "grad_norm": 6.537086009979248,
187
+ "learning_rate": 8.728897715988084e-06,
188
+ "loss": 0.6459,
189
+ "step": 460
190
+ },
191
+ {
192
+ "epoch": 0.6499661475964793,
193
+ "grad_norm": 6.739567756652832,
194
+ "learning_rate": 8.629592850049653e-06,
195
+ "loss": 0.6736,
196
+ "step": 480
197
+ },
198
+ {
199
+ "epoch": 0.6770480704129993,
200
+ "grad_norm": 7.422546863555908,
201
+ "learning_rate": 8.530287984111221e-06,
202
+ "loss": 0.6591,
203
+ "step": 500
204
+ },
205
+ {
206
+ "epoch": 0.7041299932295193,
207
+ "grad_norm": 5.7051215171813965,
208
+ "learning_rate": 8.430983118172792e-06,
209
+ "loss": 0.6442,
210
+ "step": 520
211
+ },
212
+ {
213
+ "epoch": 0.7312119160460393,
214
+ "grad_norm": 7.166143417358398,
215
+ "learning_rate": 8.33167825223436e-06,
216
+ "loss": 0.6326,
217
+ "step": 540
218
+ },
219
+ {
220
+ "epoch": 0.7582938388625592,
221
+ "grad_norm": 7.759460926055908,
222
+ "learning_rate": 8.232373386295929e-06,
223
+ "loss": 0.6328,
224
+ "step": 560
225
+ },
226
+ {
227
+ "epoch": 0.7853757616790792,
228
+ "grad_norm": 5.876537799835205,
229
+ "learning_rate": 8.133068520357497e-06,
230
+ "loss": 0.5945,
231
+ "step": 580
232
+ },
233
+ {
234
+ "epoch": 0.8124576844955992,
235
+ "grad_norm": 6.475106716156006,
236
+ "learning_rate": 8.033763654419066e-06,
237
+ "loss": 0.6038,
238
+ "step": 600
239
+ },
240
+ {
241
+ "epoch": 0.8124576844955992,
242
+ "eval_loss": 0.5318673849105835,
243
+ "eval_runtime": 1614.0202,
244
+ "eval_samples_per_second": 3.659,
245
+ "eval_steps_per_second": 0.458,
246
+ "eval_wer": 44.61419121291129,
247
+ "step": 600
248
+ },
249
+ {
250
+ "epoch": 0.8395396073121192,
251
+ "grad_norm": 6.8348259925842285,
252
+ "learning_rate": 7.934458788480636e-06,
253
+ "loss": 0.6046,
254
+ "step": 620
255
+ },
256
+ {
257
+ "epoch": 0.8666215301286392,
258
+ "grad_norm": 6.060784339904785,
259
+ "learning_rate": 7.835153922542206e-06,
260
+ "loss": 0.6055,
261
+ "step": 640
262
+ },
263
+ {
264
+ "epoch": 0.8937034529451591,
265
+ "grad_norm": 6.01262903213501,
266
+ "learning_rate": 7.735849056603775e-06,
267
+ "loss": 0.6139,
268
+ "step": 660
269
+ },
270
+ {
271
+ "epoch": 0.9207853757616791,
272
+ "grad_norm": 6.706854820251465,
273
+ "learning_rate": 7.636544190665344e-06,
274
+ "loss": 0.5789,
275
+ "step": 680
276
+ },
277
+ {
278
+ "epoch": 0.9478672985781991,
279
+ "grad_norm": 6.464681625366211,
280
+ "learning_rate": 7.537239324726913e-06,
281
+ "loss": 0.5857,
282
+ "step": 700
283
+ },
284
+ {
285
+ "epoch": 0.9749492213947191,
286
+ "grad_norm": 7.8035478591918945,
287
+ "learning_rate": 7.437934458788482e-06,
288
+ "loss": 0.5739,
289
+ "step": 720
290
+ },
291
+ {
292
+ "epoch": 1.001354096140826,
293
+ "grad_norm": 5.574532985687256,
294
+ "learning_rate": 7.33862959285005e-06,
295
+ "loss": 0.5443,
296
+ "step": 740
297
+ },
298
+ {
299
+ "epoch": 1.028436018957346,
300
+ "grad_norm": 5.275153160095215,
301
+ "learning_rate": 7.23932472691162e-06,
302
+ "loss": 0.4705,
303
+ "step": 760
304
+ },
305
+ {
306
+ "epoch": 1.055517941773866,
307
+ "grad_norm": 5.722527027130127,
308
+ "learning_rate": 7.140019860973188e-06,
309
+ "loss": 0.4692,
310
+ "step": 780
311
+ },
312
+ {
313
+ "epoch": 1.0825998645903858,
314
+ "grad_norm": 5.501762866973877,
315
+ "learning_rate": 7.040714995034758e-06,
316
+ "loss": 0.4791,
317
+ "step": 800
318
+ },
319
+ {
320
+ "epoch": 1.0825998645903858,
321
+ "eval_loss": 0.46895861625671387,
322
+ "eval_runtime": 1599.6376,
323
+ "eval_samples_per_second": 3.691,
324
+ "eval_steps_per_second": 0.462,
325
+ "eval_wer": 40.9726561658299,
326
+ "step": 800
327
+ },
328
+ {
329
+ "epoch": 1.1096817874069058,
330
+ "grad_norm": 5.207645893096924,
331
+ "learning_rate": 6.941410129096326e-06,
332
+ "loss": 0.4569,
333
+ "step": 820
334
+ },
335
+ {
336
+ "epoch": 1.1367637102234258,
337
+ "grad_norm": 5.584475517272949,
338
+ "learning_rate": 6.842105263157896e-06,
339
+ "loss": 0.4262,
340
+ "step": 840
341
+ },
342
+ {
343
+ "epoch": 1.1638456330399458,
344
+ "grad_norm": 5.881894588470459,
345
+ "learning_rate": 6.742800397219464e-06,
346
+ "loss": 0.4533,
347
+ "step": 860
348
+ },
349
+ {
350
+ "epoch": 1.1909275558564658,
351
+ "grad_norm": 6.939334869384766,
352
+ "learning_rate": 6.643495531281034e-06,
353
+ "loss": 0.4675,
354
+ "step": 880
355
+ },
356
+ {
357
+ "epoch": 1.2180094786729858,
358
+ "grad_norm": 5.780360698699951,
359
+ "learning_rate": 6.544190665342602e-06,
360
+ "loss": 0.4238,
361
+ "step": 900
362
+ },
363
+ {
364
+ "epoch": 1.2450914014895058,
365
+ "grad_norm": 5.867160797119141,
366
+ "learning_rate": 6.444885799404172e-06,
367
+ "loss": 0.444,
368
+ "step": 920
369
+ },
370
+ {
371
+ "epoch": 1.2721733243060258,
372
+ "grad_norm": 6.121824741363525,
373
+ "learning_rate": 6.34558093346574e-06,
374
+ "loss": 0.4333,
375
+ "step": 940
376
+ },
377
+ {
378
+ "epoch": 1.2992552471225456,
379
+ "grad_norm": 5.132157802581787,
380
+ "learning_rate": 6.24627606752731e-06,
381
+ "loss": 0.4544,
382
+ "step": 960
383
+ },
384
+ {
385
+ "epoch": 1.3263371699390656,
386
+ "grad_norm": 6.384315013885498,
387
+ "learning_rate": 6.146971201588878e-06,
388
+ "loss": 0.4332,
389
+ "step": 980
390
+ },
391
+ {
392
+ "epoch": 1.3534190927555856,
393
+ "grad_norm": 4.54695987701416,
394
+ "learning_rate": 6.047666335650447e-06,
395
+ "loss": 0.4416,
396
+ "step": 1000
397
+ },
398
+ {
399
+ "epoch": 1.3534190927555856,
400
+ "eval_loss": 0.43043428659439087,
401
+ "eval_runtime": 1614.3899,
402
+ "eval_samples_per_second": 3.658,
403
+ "eval_steps_per_second": 0.458,
404
+ "eval_wer": 38.00771402098731,
405
+ "step": 1000
406
+ },
407
+ {
408
+ "epoch": 1.3805010155721056,
409
+ "grad_norm": 4.973900318145752,
410
+ "learning_rate": 5.948361469712016e-06,
411
+ "loss": 0.4401,
412
+ "step": 1020
413
+ },
414
+ {
415
+ "epoch": 1.4075829383886256,
416
+ "grad_norm": 5.952788352966309,
417
+ "learning_rate": 5.849056603773585e-06,
418
+ "loss": 0.4865,
419
+ "step": 1040
420
+ },
421
+ {
422
+ "epoch": 1.4346648612051456,
423
+ "grad_norm": 6.601942539215088,
424
+ "learning_rate": 5.749751737835154e-06,
425
+ "loss": 0.4435,
426
+ "step": 1060
427
+ },
428
+ {
429
+ "epoch": 1.4617467840216656,
430
+ "grad_norm": 6.17143440246582,
431
+ "learning_rate": 5.650446871896723e-06,
432
+ "loss": 0.451,
433
+ "step": 1080
434
+ },
435
+ {
436
+ "epoch": 1.4888287068381856,
437
+ "grad_norm": 5.782886981964111,
438
+ "learning_rate": 5.551142005958292e-06,
439
+ "loss": 0.4311,
440
+ "step": 1100
441
+ },
442
+ {
443
+ "epoch": 1.5159106296547056,
444
+ "grad_norm": 5.734127998352051,
445
+ "learning_rate": 5.451837140019861e-06,
446
+ "loss": 0.4632,
447
+ "step": 1120
448
+ },
449
+ {
450
+ "epoch": 1.5429925524712256,
451
+ "grad_norm": 5.601761341094971,
452
+ "learning_rate": 5.35253227408143e-06,
453
+ "loss": 0.424,
454
+ "step": 1140
455
+ },
456
+ {
457
+ "epoch": 1.5700744752877456,
458
+ "grad_norm": 5.866110801696777,
459
+ "learning_rate": 5.253227408142999e-06,
460
+ "loss": 0.4273,
461
+ "step": 1160
462
+ },
463
+ {
464
+ "epoch": 1.5971563981042654,
465
+ "grad_norm": 5.120361328125,
466
+ "learning_rate": 5.153922542204568e-06,
467
+ "loss": 0.4297,
468
+ "step": 1180
469
+ },
470
+ {
471
+ "epoch": 1.6242383209207854,
472
+ "grad_norm": 5.093082427978516,
473
+ "learning_rate": 5.054617676266137e-06,
474
+ "loss": 0.4321,
475
+ "step": 1200
476
+ },
477
+ {
478
+ "epoch": 1.6242383209207854,
479
+ "eval_loss": 0.3993258476257324,
480
+ "eval_runtime": 1619.3754,
481
+ "eval_samples_per_second": 3.646,
482
+ "eval_steps_per_second": 0.456,
483
+ "eval_wer": 37.091943373052636,
484
+ "step": 1200
485
+ }
486
+ ],
487
+ "logging_steps": 20,
488
+ "max_steps": 2214,
489
+ "num_input_tokens_seen": 0,
490
+ "num_train_epochs": 3,
491
+ "save_steps": 200,
492
+ "stateful_callbacks": {
493
+ "TrainerControl": {
494
+ "args": {
495
+ "should_epoch_stop": false,
496
+ "should_evaluate": false,
497
+ "should_log": false,
498
+ "should_save": true,
499
+ "should_training_stop": false
500
+ },
501
+ "attributes": {}
502
+ }
503
+ },
504
+ "total_flos": 1.107359898107904e+19,
505
+ "train_batch_size": 16,
506
+ "trial_name": null,
507
+ "trial_params": null
508
+ }
checkpoint-1200/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d182356a12cee606170360f976946e1a56ce9d494fc1de269ffa184a80a19aa
3
+ size 5432
checkpoint-1400/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "gelu",
4
+ "apply_spec_augment": false,
5
+ "architectures": [
6
+ "WhisperForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "begin_suppress_tokens": null,
10
+ "bos_token_id": 50257,
11
+ "classifier_proj_size": 256,
12
+ "d_model": 768,
13
+ "decoder_attention_heads": 12,
14
+ "decoder_ffn_dim": 3072,
15
+ "decoder_layerdrop": 0.0,
16
+ "decoder_layers": 12,
17
+ "decoder_start_token_id": 50258,
18
+ "dropout": 0.0,
19
+ "encoder_attention_heads": 12,
20
+ "encoder_ffn_dim": 3072,
21
+ "encoder_layerdrop": 0.0,
22
+ "encoder_layers": 12,
23
+ "eos_token_id": 50257,
24
+ "forced_decoder_ids": null,
25
+ "init_std": 0.02,
26
+ "is_encoder_decoder": true,
27
+ "mask_feature_length": 10,
28
+ "mask_feature_min_masks": 0,
29
+ "mask_feature_prob": 0.0,
30
+ "mask_time_length": 10,
31
+ "mask_time_min_masks": 2,
32
+ "mask_time_prob": 0.05,
33
+ "max_length": null,
34
+ "max_source_positions": 1500,
35
+ "max_target_positions": 448,
36
+ "median_filter_width": 7,
37
+ "model_type": "whisper",
38
+ "num_hidden_layers": 12,
39
+ "num_mel_bins": 80,
40
+ "pad_token_id": 50257,
41
+ "scale_embedding": false,
42
+ "torch_dtype": "float32",
43
+ "transformers_version": "4.51.3",
44
+ "use_cache": false,
45
+ "use_weighted_layer_sum": false,
46
+ "vocab_size": 51865
47
+ }
checkpoint-1400/generation_config.json ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alignment_heads": [
3
+ [
4
+ 5,
5
+ 3
6
+ ],
7
+ [
8
+ 5,
9
+ 9
10
+ ],
11
+ [
12
+ 8,
13
+ 0
14
+ ],
15
+ [
16
+ 8,
17
+ 4
18
+ ],
19
+ [
20
+ 8,
21
+ 7
22
+ ],
23
+ [
24
+ 8,
25
+ 8
26
+ ],
27
+ [
28
+ 9,
29
+ 0
30
+ ],
31
+ [
32
+ 9,
33
+ 7
34
+ ],
35
+ [
36
+ 9,
37
+ 9
38
+ ],
39
+ [
40
+ 10,
41
+ 5
42
+ ]
43
+ ],
44
+ "begin_suppress_tokens": [
45
+ 220,
46
+ 50257
47
+ ],
48
+ "bos_token_id": 50257,
49
+ "decoder_start_token_id": 50258,
50
+ "eos_token_id": 50257,
51
+ "forced_decoder_ids": [
52
+ [
53
+ 1,
54
+ null
55
+ ],
56
+ [
57
+ 2,
58
+ 50359
59
+ ]
60
+ ],
61
+ "is_multilingual": true,
62
+ "lang_to_id": {
63
+ "<|af|>": 50327,
64
+ "<|am|>": 50334,
65
+ "<|ar|>": 50272,
66
+ "<|as|>": 50350,
67
+ "<|az|>": 50304,
68
+ "<|ba|>": 50355,
69
+ "<|be|>": 50330,
70
+ "<|bg|>": 50292,
71
+ "<|bn|>": 50302,
72
+ "<|bo|>": 50347,
73
+ "<|br|>": 50309,
74
+ "<|bs|>": 50315,
75
+ "<|ca|>": 50270,
76
+ "<|cs|>": 50283,
77
+ "<|cy|>": 50297,
78
+ "<|da|>": 50285,
79
+ "<|de|>": 50261,
80
+ "<|el|>": 50281,
81
+ "<|en|>": 50259,
82
+ "<|es|>": 50262,
83
+ "<|et|>": 50307,
84
+ "<|eu|>": 50310,
85
+ "<|fa|>": 50300,
86
+ "<|fi|>": 50277,
87
+ "<|fo|>": 50338,
88
+ "<|fr|>": 50265,
89
+ "<|gl|>": 50319,
90
+ "<|gu|>": 50333,
91
+ "<|haw|>": 50352,
92
+ "<|ha|>": 50354,
93
+ "<|he|>": 50279,
94
+ "<|hi|>": 50276,
95
+ "<|hr|>": 50291,
96
+ "<|ht|>": 50339,
97
+ "<|hu|>": 50286,
98
+ "<|hy|>": 50312,
99
+ "<|id|>": 50275,
100
+ "<|is|>": 50311,
101
+ "<|it|>": 50274,
102
+ "<|ja|>": 50266,
103
+ "<|jw|>": 50356,
104
+ "<|ka|>": 50329,
105
+ "<|kk|>": 50316,
106
+ "<|km|>": 50323,
107
+ "<|kn|>": 50306,
108
+ "<|ko|>": 50264,
109
+ "<|la|>": 50294,
110
+ "<|lb|>": 50345,
111
+ "<|ln|>": 50353,
112
+ "<|lo|>": 50336,
113
+ "<|lt|>": 50293,
114
+ "<|lv|>": 50301,
115
+ "<|mg|>": 50349,
116
+ "<|mi|>": 50295,
117
+ "<|mk|>": 50308,
118
+ "<|ml|>": 50296,
119
+ "<|mn|>": 50314,
120
+ "<|mr|>": 50320,
121
+ "<|ms|>": 50282,
122
+ "<|mt|>": 50343,
123
+ "<|my|>": 50346,
124
+ "<|ne|>": 50313,
125
+ "<|nl|>": 50271,
126
+ "<|nn|>": 50342,
127
+ "<|no|>": 50288,
128
+ "<|oc|>": 50328,
129
+ "<|pa|>": 50321,
130
+ "<|pl|>": 50269,
131
+ "<|ps|>": 50340,
132
+ "<|pt|>": 50267,
133
+ "<|ro|>": 50284,
134
+ "<|ru|>": 50263,
135
+ "<|sa|>": 50344,
136
+ "<|sd|>": 50332,
137
+ "<|si|>": 50322,
138
+ "<|sk|>": 50298,
139
+ "<|sl|>": 50305,
140
+ "<|sn|>": 50324,
141
+ "<|so|>": 50326,
142
+ "<|sq|>": 50317,
143
+ "<|sr|>": 50303,
144
+ "<|su|>": 50357,
145
+ "<|sv|>": 50273,
146
+ "<|sw|>": 50318,
147
+ "<|ta|>": 50287,
148
+ "<|te|>": 50299,
149
+ "<|tg|>": 50331,
150
+ "<|th|>": 50289,
151
+ "<|tk|>": 50341,
152
+ "<|tl|>": 50348,
153
+ "<|tr|>": 50268,
154
+ "<|tt|>": 50351,
155
+ "<|uk|>": 50280,
156
+ "<|ur|>": 50290,
157
+ "<|uz|>": 50337,
158
+ "<|vi|>": 50278,
159
+ "<|yi|>": 50335,
160
+ "<|yo|>": 50325,
161
+ "<|zh|>": 50260
162
+ },
163
+ "max_initial_timestamp_index": 50,
164
+ "max_length": 448,
165
+ "no_timestamps_token_id": 50363,
166
+ "pad_token_id": 50257,
167
+ "prev_sot_token_id": 50361,
168
+ "return_timestamps": false,
169
+ "suppress_tokens": [],
170
+ "task_to_id": {
171
+ "transcribe": 50359,
172
+ "translate": 50358
173
+ },
174
+ "transformers_version": "4.51.3"
175
+ }
checkpoint-1400/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1f0ee7d8634de74415ef90e2882c7e96fdac9cee1dad2f36bd4a6036ee092f6
3
+ size 966995080
checkpoint-1400/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b40e7c5f8642aef067896fd0c278e4503941bf01b49682e4d60e13cac862df5
3
+ size 1925064044
checkpoint-1400/preprocessor_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 30,
3
+ "dither": 0.0,
4
+ "feature_extractor_type": "WhisperFeatureExtractor",
5
+ "feature_size": 80,
6
+ "hop_length": 160,
7
+ "n_fft": 400,
8
+ "n_samples": 480000,
9
+ "nb_max_frames": 3000,
10
+ "padding_side": "right",
11
+ "padding_value": 0.0,
12
+ "processor_class": "WhisperProcessor",
13
+ "return_attention_mask": false,
14
+ "sampling_rate": 16000
15
+ }
checkpoint-1400/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c06f045bd7c705582a3aca1aaddd4580344f768a5e94a2ef07e8740f5001422
3
+ size 14244
checkpoint-1400/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:685987cc5b0f2ee0b68c24f0a8db28864faad52607ac80c3a55717c0108676b8
3
+ size 988
checkpoint-1400/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0d27f0a7e829c9c887f2fcf55a867e136d8c795c676853896d6b350d68cf9fe
3
+ size 1064
checkpoint-1400/trainer_state.json ADDED
@@ -0,0 +1,587 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1400,
3
+ "best_metric": 35.20006895214291,
4
+ "best_model_checkpoint": "./HAUSA_B/checkpoint-1400",
5
+ "epoch": 1.8950575490859851,
6
+ "eval_steps": 200,
7
+ "global_step": 1400,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.027081922816519974,
14
+ "grad_norm": 106.59215545654297,
15
+ "learning_rate": 8.000000000000001e-07,
16
+ "loss": 6.1915,
17
+ "step": 20
18
+ },
19
+ {
20
+ "epoch": 0.05416384563303995,
21
+ "grad_norm": 30.563518524169922,
22
+ "learning_rate": 1.8000000000000001e-06,
23
+ "loss": 4.5886,
24
+ "step": 40
25
+ },
26
+ {
27
+ "epoch": 0.08124576844955991,
28
+ "grad_norm": 16.528457641601562,
29
+ "learning_rate": 2.8000000000000003e-06,
30
+ "loss": 3.0338,
31
+ "step": 60
32
+ },
33
+ {
34
+ "epoch": 0.1083276912660799,
35
+ "grad_norm": 13.852096557617188,
36
+ "learning_rate": 3.8000000000000005e-06,
37
+ "loss": 2.3188,
38
+ "step": 80
39
+ },
40
+ {
41
+ "epoch": 0.13540961408259986,
42
+ "grad_norm": 13.284646987915039,
43
+ "learning_rate": 4.800000000000001e-06,
44
+ "loss": 1.9725,
45
+ "step": 100
46
+ },
47
+ {
48
+ "epoch": 0.16249153689911983,
49
+ "grad_norm": 13.212055206298828,
50
+ "learning_rate": 5.8e-06,
51
+ "loss": 1.7839,
52
+ "step": 120
53
+ },
54
+ {
55
+ "epoch": 0.1895734597156398,
56
+ "grad_norm": 12.006990432739258,
57
+ "learning_rate": 6.800000000000001e-06,
58
+ "loss": 1.5991,
59
+ "step": 140
60
+ },
61
+ {
62
+ "epoch": 0.2166553825321598,
63
+ "grad_norm": 12.490514755249023,
64
+ "learning_rate": 7.800000000000002e-06,
65
+ "loss": 1.4411,
66
+ "step": 160
67
+ },
68
+ {
69
+ "epoch": 0.24373730534867977,
70
+ "grad_norm": 11.587646484375,
71
+ "learning_rate": 8.8e-06,
72
+ "loss": 1.3487,
73
+ "step": 180
74
+ },
75
+ {
76
+ "epoch": 0.2708192281651997,
77
+ "grad_norm": 11.574933052062988,
78
+ "learning_rate": 9.800000000000001e-06,
79
+ "loss": 1.2119,
80
+ "step": 200
81
+ },
82
+ {
83
+ "epoch": 0.2708192281651997,
84
+ "eval_loss": 1.1157827377319336,
85
+ "eval_runtime": 1601.2991,
86
+ "eval_samples_per_second": 3.688,
87
+ "eval_steps_per_second": 0.462,
88
+ "eval_wer": 64.22461160550755,
89
+ "step": 200
90
+ },
91
+ {
92
+ "epoch": 0.2979011509817197,
93
+ "grad_norm": 8.368565559387207,
94
+ "learning_rate": 9.920556107249256e-06,
95
+ "loss": 0.9452,
96
+ "step": 220
97
+ },
98
+ {
99
+ "epoch": 0.32498307379823965,
100
+ "grad_norm": 7.2616682052612305,
101
+ "learning_rate": 9.821251241310825e-06,
102
+ "loss": 0.9298,
103
+ "step": 240
104
+ },
105
+ {
106
+ "epoch": 0.35206499661475965,
107
+ "grad_norm": 7.266394138336182,
108
+ "learning_rate": 9.721946375372395e-06,
109
+ "loss": 0.8567,
110
+ "step": 260
111
+ },
112
+ {
113
+ "epoch": 0.3791469194312796,
114
+ "grad_norm": 8.120067596435547,
115
+ "learning_rate": 9.622641509433963e-06,
116
+ "loss": 0.8251,
117
+ "step": 280
118
+ },
119
+ {
120
+ "epoch": 0.4062288422477996,
121
+ "grad_norm": 7.483547687530518,
122
+ "learning_rate": 9.523336643495532e-06,
123
+ "loss": 0.7774,
124
+ "step": 300
125
+ },
126
+ {
127
+ "epoch": 0.4333107650643196,
128
+ "grad_norm": 7.521896839141846,
129
+ "learning_rate": 9.4240317775571e-06,
130
+ "loss": 0.7266,
131
+ "step": 320
132
+ },
133
+ {
134
+ "epoch": 0.46039268788083954,
135
+ "grad_norm": 7.073266506195068,
136
+ "learning_rate": 9.32472691161867e-06,
137
+ "loss": 0.7248,
138
+ "step": 340
139
+ },
140
+ {
141
+ "epoch": 0.48747461069735953,
142
+ "grad_norm": 6.335423469543457,
143
+ "learning_rate": 9.22542204568024e-06,
144
+ "loss": 0.7151,
145
+ "step": 360
146
+ },
147
+ {
148
+ "epoch": 0.5145565335138795,
149
+ "grad_norm": 6.936922550201416,
150
+ "learning_rate": 9.126117179741808e-06,
151
+ "loss": 0.6881,
152
+ "step": 380
153
+ },
154
+ {
155
+ "epoch": 0.5416384563303994,
156
+ "grad_norm": 7.596807479858398,
157
+ "learning_rate": 9.026812313803377e-06,
158
+ "loss": 0.6995,
159
+ "step": 400
160
+ },
161
+ {
162
+ "epoch": 0.5416384563303994,
163
+ "eval_loss": 0.6352065801620483,
164
+ "eval_runtime": 1597.3635,
165
+ "eval_samples_per_second": 3.697,
166
+ "eval_steps_per_second": 0.463,
167
+ "eval_wer": 51.63653601672089,
168
+ "step": 400
169
+ },
170
+ {
171
+ "epoch": 0.5687203791469194,
172
+ "grad_norm": 6.981812477111816,
173
+ "learning_rate": 8.927507447864945e-06,
174
+ "loss": 0.6795,
175
+ "step": 420
176
+ },
177
+ {
178
+ "epoch": 0.5958023019634394,
179
+ "grad_norm": 6.481506824493408,
180
+ "learning_rate": 8.828202581926516e-06,
181
+ "loss": 0.6643,
182
+ "step": 440
183
+ },
184
+ {
185
+ "epoch": 0.6228842247799594,
186
+ "grad_norm": 6.537086009979248,
187
+ "learning_rate": 8.728897715988084e-06,
188
+ "loss": 0.6459,
189
+ "step": 460
190
+ },
191
+ {
192
+ "epoch": 0.6499661475964793,
193
+ "grad_norm": 6.739567756652832,
194
+ "learning_rate": 8.629592850049653e-06,
195
+ "loss": 0.6736,
196
+ "step": 480
197
+ },
198
+ {
199
+ "epoch": 0.6770480704129993,
200
+ "grad_norm": 7.422546863555908,
201
+ "learning_rate": 8.530287984111221e-06,
202
+ "loss": 0.6591,
203
+ "step": 500
204
+ },
205
+ {
206
+ "epoch": 0.7041299932295193,
207
+ "grad_norm": 5.7051215171813965,
208
+ "learning_rate": 8.430983118172792e-06,
209
+ "loss": 0.6442,
210
+ "step": 520
211
+ },
212
+ {
213
+ "epoch": 0.7312119160460393,
214
+ "grad_norm": 7.166143417358398,
215
+ "learning_rate": 8.33167825223436e-06,
216
+ "loss": 0.6326,
217
+ "step": 540
218
+ },
219
+ {
220
+ "epoch": 0.7582938388625592,
221
+ "grad_norm": 7.759460926055908,
222
+ "learning_rate": 8.232373386295929e-06,
223
+ "loss": 0.6328,
224
+ "step": 560
225
+ },
226
+ {
227
+ "epoch": 0.7853757616790792,
228
+ "grad_norm": 5.876537799835205,
229
+ "learning_rate": 8.133068520357497e-06,
230
+ "loss": 0.5945,
231
+ "step": 580
232
+ },
233
+ {
234
+ "epoch": 0.8124576844955992,
235
+ "grad_norm": 6.475106716156006,
236
+ "learning_rate": 8.033763654419066e-06,
237
+ "loss": 0.6038,
238
+ "step": 600
239
+ },
240
+ {
241
+ "epoch": 0.8124576844955992,
242
+ "eval_loss": 0.5318673849105835,
243
+ "eval_runtime": 1614.0202,
244
+ "eval_samples_per_second": 3.659,
245
+ "eval_steps_per_second": 0.458,
246
+ "eval_wer": 44.61419121291129,
247
+ "step": 600
248
+ },
249
+ {
250
+ "epoch": 0.8395396073121192,
251
+ "grad_norm": 6.8348259925842285,
252
+ "learning_rate": 7.934458788480636e-06,
253
+ "loss": 0.6046,
254
+ "step": 620
255
+ },
256
+ {
257
+ "epoch": 0.8666215301286392,
258
+ "grad_norm": 6.060784339904785,
259
+ "learning_rate": 7.835153922542206e-06,
260
+ "loss": 0.6055,
261
+ "step": 640
262
+ },
263
+ {
264
+ "epoch": 0.8937034529451591,
265
+ "grad_norm": 6.01262903213501,
266
+ "learning_rate": 7.735849056603775e-06,
267
+ "loss": 0.6139,
268
+ "step": 660
269
+ },
270
+ {
271
+ "epoch": 0.9207853757616791,
272
+ "grad_norm": 6.706854820251465,
273
+ "learning_rate": 7.636544190665344e-06,
274
+ "loss": 0.5789,
275
+ "step": 680
276
+ },
277
+ {
278
+ "epoch": 0.9478672985781991,
279
+ "grad_norm": 6.464681625366211,
280
+ "learning_rate": 7.537239324726913e-06,
281
+ "loss": 0.5857,
282
+ "step": 700
283
+ },
284
+ {
285
+ "epoch": 0.9749492213947191,
286
+ "grad_norm": 7.8035478591918945,
287
+ "learning_rate": 7.437934458788482e-06,
288
+ "loss": 0.5739,
289
+ "step": 720
290
+ },
291
+ {
292
+ "epoch": 1.001354096140826,
293
+ "grad_norm": 5.574532985687256,
294
+ "learning_rate": 7.33862959285005e-06,
295
+ "loss": 0.5443,
296
+ "step": 740
297
+ },
298
+ {
299
+ "epoch": 1.028436018957346,
300
+ "grad_norm": 5.275153160095215,
301
+ "learning_rate": 7.23932472691162e-06,
302
+ "loss": 0.4705,
303
+ "step": 760
304
+ },
305
+ {
306
+ "epoch": 1.055517941773866,
307
+ "grad_norm": 5.722527027130127,
308
+ "learning_rate": 7.140019860973188e-06,
309
+ "loss": 0.4692,
310
+ "step": 780
311
+ },
312
+ {
313
+ "epoch": 1.0825998645903858,
314
+ "grad_norm": 5.501762866973877,
315
+ "learning_rate": 7.040714995034758e-06,
316
+ "loss": 0.4791,
317
+ "step": 800
318
+ },
319
+ {
320
+ "epoch": 1.0825998645903858,
321
+ "eval_loss": 0.46895861625671387,
322
+ "eval_runtime": 1599.6376,
323
+ "eval_samples_per_second": 3.691,
324
+ "eval_steps_per_second": 0.462,
325
+ "eval_wer": 40.9726561658299,
326
+ "step": 800
327
+ },
328
+ {
329
+ "epoch": 1.1096817874069058,
330
+ "grad_norm": 5.207645893096924,
331
+ "learning_rate": 6.941410129096326e-06,
332
+ "loss": 0.4569,
333
+ "step": 820
334
+ },
335
+ {
336
+ "epoch": 1.1367637102234258,
337
+ "grad_norm": 5.584475517272949,
338
+ "learning_rate": 6.842105263157896e-06,
339
+ "loss": 0.4262,
340
+ "step": 840
341
+ },
342
+ {
343
+ "epoch": 1.1638456330399458,
344
+ "grad_norm": 5.881894588470459,
345
+ "learning_rate": 6.742800397219464e-06,
346
+ "loss": 0.4533,
347
+ "step": 860
348
+ },
349
+ {
350
+ "epoch": 1.1909275558564658,
351
+ "grad_norm": 6.939334869384766,
352
+ "learning_rate": 6.643495531281034e-06,
353
+ "loss": 0.4675,
354
+ "step": 880
355
+ },
356
+ {
357
+ "epoch": 1.2180094786729858,
358
+ "grad_norm": 5.780360698699951,
359
+ "learning_rate": 6.544190665342602e-06,
360
+ "loss": 0.4238,
361
+ "step": 900
362
+ },
363
+ {
364
+ "epoch": 1.2450914014895058,
365
+ "grad_norm": 5.867160797119141,
366
+ "learning_rate": 6.444885799404172e-06,
367
+ "loss": 0.444,
368
+ "step": 920
369
+ },
370
+ {
371
+ "epoch": 1.2721733243060258,
372
+ "grad_norm": 6.121824741363525,
373
+ "learning_rate": 6.34558093346574e-06,
374
+ "loss": 0.4333,
375
+ "step": 940
376
+ },
377
+ {
378
+ "epoch": 1.2992552471225456,
379
+ "grad_norm": 5.132157802581787,
380
+ "learning_rate": 6.24627606752731e-06,
381
+ "loss": 0.4544,
382
+ "step": 960
383
+ },
384
+ {
385
+ "epoch": 1.3263371699390656,
386
+ "grad_norm": 6.384315013885498,
387
+ "learning_rate": 6.146971201588878e-06,
388
+ "loss": 0.4332,
389
+ "step": 980
390
+ },
391
+ {
392
+ "epoch": 1.3534190927555856,
393
+ "grad_norm": 4.54695987701416,
394
+ "learning_rate": 6.047666335650447e-06,
395
+ "loss": 0.4416,
396
+ "step": 1000
397
+ },
398
+ {
399
+ "epoch": 1.3534190927555856,
400
+ "eval_loss": 0.43043428659439087,
401
+ "eval_runtime": 1614.3899,
402
+ "eval_samples_per_second": 3.658,
403
+ "eval_steps_per_second": 0.458,
404
+ "eval_wer": 38.00771402098731,
405
+ "step": 1000
406
+ },
407
+ {
408
+ "epoch": 1.3805010155721056,
409
+ "grad_norm": 4.973900318145752,
410
+ "learning_rate": 5.948361469712016e-06,
411
+ "loss": 0.4401,
412
+ "step": 1020
413
+ },
414
+ {
415
+ "epoch": 1.4075829383886256,
416
+ "grad_norm": 5.952788352966309,
417
+ "learning_rate": 5.849056603773585e-06,
418
+ "loss": 0.4865,
419
+ "step": 1040
420
+ },
421
+ {
422
+ "epoch": 1.4346648612051456,
423
+ "grad_norm": 6.601942539215088,
424
+ "learning_rate": 5.749751737835154e-06,
425
+ "loss": 0.4435,
426
+ "step": 1060
427
+ },
428
+ {
429
+ "epoch": 1.4617467840216656,
430
+ "grad_norm": 6.17143440246582,
431
+ "learning_rate": 5.650446871896723e-06,
432
+ "loss": 0.451,
433
+ "step": 1080
434
+ },
435
+ {
436
+ "epoch": 1.4888287068381856,
437
+ "grad_norm": 5.782886981964111,
438
+ "learning_rate": 5.551142005958292e-06,
439
+ "loss": 0.4311,
440
+ "step": 1100
441
+ },
442
+ {
443
+ "epoch": 1.5159106296547056,
444
+ "grad_norm": 5.734127998352051,
445
+ "learning_rate": 5.451837140019861e-06,
446
+ "loss": 0.4632,
447
+ "step": 1120
448
+ },
449
+ {
450
+ "epoch": 1.5429925524712256,
451
+ "grad_norm": 5.601761341094971,
452
+ "learning_rate": 5.35253227408143e-06,
453
+ "loss": 0.424,
454
+ "step": 1140
455
+ },
456
+ {
457
+ "epoch": 1.5700744752877456,
458
+ "grad_norm": 5.866110801696777,
459
+ "learning_rate": 5.253227408142999e-06,
460
+ "loss": 0.4273,
461
+ "step": 1160
462
+ },
463
+ {
464
+ "epoch": 1.5971563981042654,
465
+ "grad_norm": 5.120361328125,
466
+ "learning_rate": 5.153922542204568e-06,
467
+ "loss": 0.4297,
468
+ "step": 1180
469
+ },
470
+ {
471
+ "epoch": 1.6242383209207854,
472
+ "grad_norm": 5.093082427978516,
473
+ "learning_rate": 5.054617676266137e-06,
474
+ "loss": 0.4321,
475
+ "step": 1200
476
+ },
477
+ {
478
+ "epoch": 1.6242383209207854,
479
+ "eval_loss": 0.3993258476257324,
480
+ "eval_runtime": 1619.3754,
481
+ "eval_samples_per_second": 3.646,
482
+ "eval_steps_per_second": 0.456,
483
+ "eval_wer": 37.091943373052636,
484
+ "step": 1200
485
+ },
486
+ {
487
+ "epoch": 1.6513202437373053,
488
+ "grad_norm": 5.076949119567871,
489
+ "learning_rate": 4.955312810327706e-06,
490
+ "loss": 0.4446,
491
+ "step": 1220
492
+ },
493
+ {
494
+ "epoch": 1.6784021665538253,
495
+ "grad_norm": 5.729442596435547,
496
+ "learning_rate": 4.856007944389276e-06,
497
+ "loss": 0.4375,
498
+ "step": 1240
499
+ },
500
+ {
501
+ "epoch": 1.7054840893703453,
502
+ "grad_norm": 5.660022258758545,
503
+ "learning_rate": 4.756703078450844e-06,
504
+ "loss": 0.4318,
505
+ "step": 1260
506
+ },
507
+ {
508
+ "epoch": 1.7325660121868651,
509
+ "grad_norm": 5.320847511291504,
510
+ "learning_rate": 4.657398212512414e-06,
511
+ "loss": 0.4255,
512
+ "step": 1280
513
+ },
514
+ {
515
+ "epoch": 1.7596479350033851,
516
+ "grad_norm": 4.652419567108154,
517
+ "learning_rate": 4.558093346573982e-06,
518
+ "loss": 0.4231,
519
+ "step": 1300
520
+ },
521
+ {
522
+ "epoch": 1.7867298578199051,
523
+ "grad_norm": 6.301181793212891,
524
+ "learning_rate": 4.458788480635551e-06,
525
+ "loss": 0.4426,
526
+ "step": 1320
527
+ },
528
+ {
529
+ "epoch": 1.8138117806364251,
530
+ "grad_norm": 5.540495872497559,
531
+ "learning_rate": 4.35948361469712e-06,
532
+ "loss": 0.4361,
533
+ "step": 1340
534
+ },
535
+ {
536
+ "epoch": 1.8408937034529451,
537
+ "grad_norm": 5.618797302246094,
538
+ "learning_rate": 4.260178748758689e-06,
539
+ "loss": 0.4081,
540
+ "step": 1360
541
+ },
542
+ {
543
+ "epoch": 1.8679756262694651,
544
+ "grad_norm": 5.519278526306152,
545
+ "learning_rate": 4.160873882820258e-06,
546
+ "loss": 0.4311,
547
+ "step": 1380
548
+ },
549
+ {
550
+ "epoch": 1.8950575490859851,
551
+ "grad_norm": 5.24718713760376,
552
+ "learning_rate": 4.061569016881828e-06,
553
+ "loss": 0.441,
554
+ "step": 1400
555
+ },
556
+ {
557
+ "epoch": 1.8950575490859851,
558
+ "eval_loss": 0.37384942173957825,
559
+ "eval_runtime": 1616.0813,
560
+ "eval_samples_per_second": 3.654,
561
+ "eval_steps_per_second": 0.457,
562
+ "eval_wer": 35.20006895214291,
563
+ "step": 1400
564
+ }
565
+ ],
566
+ "logging_steps": 20,
567
+ "max_steps": 2214,
568
+ "num_input_tokens_seen": 0,
569
+ "num_train_epochs": 3,
570
+ "save_steps": 200,
571
+ "stateful_callbacks": {
572
+ "TrainerControl": {
573
+ "args": {
574
+ "should_epoch_stop": false,
575
+ "should_evaluate": false,
576
+ "should_log": false,
577
+ "should_save": true,
578
+ "should_training_stop": false
579
+ },
580
+ "attributes": {}
581
+ }
582
+ },
583
+ "total_flos": 1.292054554312704e+19,
584
+ "train_batch_size": 16,
585
+ "trial_name": null,
586
+ "trial_params": null
587
+ }
checkpoint-1400/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d182356a12cee606170360f976946e1a56ce9d494fc1de269ffa184a80a19aa
3
+ size 5432
checkpoint-1600/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "gelu",
4
+ "apply_spec_augment": false,
5
+ "architectures": [
6
+ "WhisperForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "begin_suppress_tokens": null,
10
+ "bos_token_id": 50257,
11
+ "classifier_proj_size": 256,
12
+ "d_model": 768,
13
+ "decoder_attention_heads": 12,
14
+ "decoder_ffn_dim": 3072,
15
+ "decoder_layerdrop": 0.0,
16
+ "decoder_layers": 12,
17
+ "decoder_start_token_id": 50258,
18
+ "dropout": 0.0,
19
+ "encoder_attention_heads": 12,
20
+ "encoder_ffn_dim": 3072,
21
+ "encoder_layerdrop": 0.0,
22
+ "encoder_layers": 12,
23
+ "eos_token_id": 50257,
24
+ "forced_decoder_ids": null,
25
+ "init_std": 0.02,
26
+ "is_encoder_decoder": true,
27
+ "mask_feature_length": 10,
28
+ "mask_feature_min_masks": 0,
29
+ "mask_feature_prob": 0.0,
30
+ "mask_time_length": 10,
31
+ "mask_time_min_masks": 2,
32
+ "mask_time_prob": 0.05,
33
+ "max_length": null,
34
+ "max_source_positions": 1500,
35
+ "max_target_positions": 448,
36
+ "median_filter_width": 7,
37
+ "model_type": "whisper",
38
+ "num_hidden_layers": 12,
39
+ "num_mel_bins": 80,
40
+ "pad_token_id": 50257,
41
+ "scale_embedding": false,
42
+ "torch_dtype": "float32",
43
+ "transformers_version": "4.51.3",
44
+ "use_cache": false,
45
+ "use_weighted_layer_sum": false,
46
+ "vocab_size": 51865
47
+ }
checkpoint-1600/generation_config.json ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alignment_heads": [
3
+ [
4
+ 5,
5
+ 3
6
+ ],
7
+ [
8
+ 5,
9
+ 9
10
+ ],
11
+ [
12
+ 8,
13
+ 0
14
+ ],
15
+ [
16
+ 8,
17
+ 4
18
+ ],
19
+ [
20
+ 8,
21
+ 7
22
+ ],
23
+ [
24
+ 8,
25
+ 8
26
+ ],
27
+ [
28
+ 9,
29
+ 0
30
+ ],
31
+ [
32
+ 9,
33
+ 7
34
+ ],
35
+ [
36
+ 9,
37
+ 9
38
+ ],
39
+ [
40
+ 10,
41
+ 5
42
+ ]
43
+ ],
44
+ "begin_suppress_tokens": [
45
+ 220,
46
+ 50257
47
+ ],
48
+ "bos_token_id": 50257,
49
+ "decoder_start_token_id": 50258,
50
+ "eos_token_id": 50257,
51
+ "forced_decoder_ids": [
52
+ [
53
+ 1,
54
+ null
55
+ ],
56
+ [
57
+ 2,
58
+ 50359
59
+ ]
60
+ ],
61
+ "is_multilingual": true,
62
+ "lang_to_id": {
63
+ "<|af|>": 50327,
64
+ "<|am|>": 50334,
65
+ "<|ar|>": 50272,
66
+ "<|as|>": 50350,
67
+ "<|az|>": 50304,
68
+ "<|ba|>": 50355,
69
+ "<|be|>": 50330,
70
+ "<|bg|>": 50292,
71
+ "<|bn|>": 50302,
72
+ "<|bo|>": 50347,
73
+ "<|br|>": 50309,
74
+ "<|bs|>": 50315,
75
+ "<|ca|>": 50270,
76
+ "<|cs|>": 50283,
77
+ "<|cy|>": 50297,
78
+ "<|da|>": 50285,
79
+ "<|de|>": 50261,
80
+ "<|el|>": 50281,
81
+ "<|en|>": 50259,
82
+ "<|es|>": 50262,
83
+ "<|et|>": 50307,
84
+ "<|eu|>": 50310,
85
+ "<|fa|>": 50300,
86
+ "<|fi|>": 50277,
87
+ "<|fo|>": 50338,
88
+ "<|fr|>": 50265,
89
+ "<|gl|>": 50319,
90
+ "<|gu|>": 50333,
91
+ "<|haw|>": 50352,
92
+ "<|ha|>": 50354,
93
+ "<|he|>": 50279,
94
+ "<|hi|>": 50276,
95
+ "<|hr|>": 50291,
96
+ "<|ht|>": 50339,
97
+ "<|hu|>": 50286,
98
+ "<|hy|>": 50312,
99
+ "<|id|>": 50275,
100
+ "<|is|>": 50311,
101
+ "<|it|>": 50274,
102
+ "<|ja|>": 50266,
103
+ "<|jw|>": 50356,
104
+ "<|ka|>": 50329,
105
+ "<|kk|>": 50316,
106
+ "<|km|>": 50323,
107
+ "<|kn|>": 50306,
108
+ "<|ko|>": 50264,
109
+ "<|la|>": 50294,
110
+ "<|lb|>": 50345,
111
+ "<|ln|>": 50353,
112
+ "<|lo|>": 50336,
113
+ "<|lt|>": 50293,
114
+ "<|lv|>": 50301,
115
+ "<|mg|>": 50349,
116
+ "<|mi|>": 50295,
117
+ "<|mk|>": 50308,
118
+ "<|ml|>": 50296,
119
+ "<|mn|>": 50314,
120
+ "<|mr|>": 50320,
121
+ "<|ms|>": 50282,
122
+ "<|mt|>": 50343,
123
+ "<|my|>": 50346,
124
+ "<|ne|>": 50313,
125
+ "<|nl|>": 50271,
126
+ "<|nn|>": 50342,
127
+ "<|no|>": 50288,
128
+ "<|oc|>": 50328,
129
+ "<|pa|>": 50321,
130
+ "<|pl|>": 50269,
131
+ "<|ps|>": 50340,
132
+ "<|pt|>": 50267,
133
+ "<|ro|>": 50284,
134
+ "<|ru|>": 50263,
135
+ "<|sa|>": 50344,
136
+ "<|sd|>": 50332,
137
+ "<|si|>": 50322,
138
+ "<|sk|>": 50298,
139
+ "<|sl|>": 50305,
140
+ "<|sn|>": 50324,
141
+ "<|so|>": 50326,
142
+ "<|sq|>": 50317,
143
+ "<|sr|>": 50303,
144
+ "<|su|>": 50357,
145
+ "<|sv|>": 50273,
146
+ "<|sw|>": 50318,
147
+ "<|ta|>": 50287,
148
+ "<|te|>": 50299,
149
+ "<|tg|>": 50331,
150
+ "<|th|>": 50289,
151
+ "<|tk|>": 50341,
152
+ "<|tl|>": 50348,
153
+ "<|tr|>": 50268,
154
+ "<|tt|>": 50351,
155
+ "<|uk|>": 50280,
156
+ "<|ur|>": 50290,
157
+ "<|uz|>": 50337,
158
+ "<|vi|>": 50278,
159
+ "<|yi|>": 50335,
160
+ "<|yo|>": 50325,
161
+ "<|zh|>": 50260
162
+ },
163
+ "max_initial_timestamp_index": 50,
164
+ "max_length": 448,
165
+ "no_timestamps_token_id": 50363,
166
+ "pad_token_id": 50257,
167
+ "prev_sot_token_id": 50361,
168
+ "return_timestamps": false,
169
+ "suppress_tokens": [],
170
+ "task_to_id": {
171
+ "transcribe": 50359,
172
+ "translate": 50358
173
+ },
174
+ "transformers_version": "4.51.3"
175
+ }
checkpoint-1600/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:195e815a216c95f9715350995a8ec27eb0a0e4dc4ab78fcc4854e60db7b43a2a
3
+ size 966995080
checkpoint-1600/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e501476dad50a1fa900e1dc3dfb6488a6f96ba0591c8178cd9fc54e4b72520ba
3
+ size 1925064044
checkpoint-1600/preprocessor_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 30,
3
+ "dither": 0.0,
4
+ "feature_extractor_type": "WhisperFeatureExtractor",
5
+ "feature_size": 80,
6
+ "hop_length": 160,
7
+ "n_fft": 400,
8
+ "n_samples": 480000,
9
+ "nb_max_frames": 3000,
10
+ "padding_side": "right",
11
+ "padding_value": 0.0,
12
+ "processor_class": "WhisperProcessor",
13
+ "return_attention_mask": false,
14
+ "sampling_rate": 16000
15
+ }
checkpoint-1600/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de6548641f28651640920c1bf324e1f8b9a6815899170d00a5e76bb82579d107
3
+ size 14244
checkpoint-1600/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62209be68631fc121c2385c081411b86d2ac6a6c67d45683880f6163d554ca03
3
+ size 988
checkpoint-1600/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11c00ff382b63572e9d003a78ef65a7346daf9110b160d2871e9936d362aeecf
3
+ size 1064
checkpoint-1600/trainer_state.json ADDED
@@ -0,0 +1,666 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1600,
3
+ "best_metric": 34.23258419703075,
4
+ "best_model_checkpoint": "./HAUSA_B/checkpoint-1600",
5
+ "epoch": 2.1651997291807716,
6
+ "eval_steps": 200,
7
+ "global_step": 1600,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.027081922816519974,
14
+ "grad_norm": 106.59215545654297,
15
+ "learning_rate": 8.000000000000001e-07,
16
+ "loss": 6.1915,
17
+ "step": 20
18
+ },
19
+ {
20
+ "epoch": 0.05416384563303995,
21
+ "grad_norm": 30.563518524169922,
22
+ "learning_rate": 1.8000000000000001e-06,
23
+ "loss": 4.5886,
24
+ "step": 40
25
+ },
26
+ {
27
+ "epoch": 0.08124576844955991,
28
+ "grad_norm": 16.528457641601562,
29
+ "learning_rate": 2.8000000000000003e-06,
30
+ "loss": 3.0338,
31
+ "step": 60
32
+ },
33
+ {
34
+ "epoch": 0.1083276912660799,
35
+ "grad_norm": 13.852096557617188,
36
+ "learning_rate": 3.8000000000000005e-06,
37
+ "loss": 2.3188,
38
+ "step": 80
39
+ },
40
+ {
41
+ "epoch": 0.13540961408259986,
42
+ "grad_norm": 13.284646987915039,
43
+ "learning_rate": 4.800000000000001e-06,
44
+ "loss": 1.9725,
45
+ "step": 100
46
+ },
47
+ {
48
+ "epoch": 0.16249153689911983,
49
+ "grad_norm": 13.212055206298828,
50
+ "learning_rate": 5.8e-06,
51
+ "loss": 1.7839,
52
+ "step": 120
53
+ },
54
+ {
55
+ "epoch": 0.1895734597156398,
56
+ "grad_norm": 12.006990432739258,
57
+ "learning_rate": 6.800000000000001e-06,
58
+ "loss": 1.5991,
59
+ "step": 140
60
+ },
61
+ {
62
+ "epoch": 0.2166553825321598,
63
+ "grad_norm": 12.490514755249023,
64
+ "learning_rate": 7.800000000000002e-06,
65
+ "loss": 1.4411,
66
+ "step": 160
67
+ },
68
+ {
69
+ "epoch": 0.24373730534867977,
70
+ "grad_norm": 11.587646484375,
71
+ "learning_rate": 8.8e-06,
72
+ "loss": 1.3487,
73
+ "step": 180
74
+ },
75
+ {
76
+ "epoch": 0.2708192281651997,
77
+ "grad_norm": 11.574933052062988,
78
+ "learning_rate": 9.800000000000001e-06,
79
+ "loss": 1.2119,
80
+ "step": 200
81
+ },
82
+ {
83
+ "epoch": 0.2708192281651997,
84
+ "eval_loss": 1.1157827377319336,
85
+ "eval_runtime": 1601.2991,
86
+ "eval_samples_per_second": 3.688,
87
+ "eval_steps_per_second": 0.462,
88
+ "eval_wer": 64.22461160550755,
89
+ "step": 200
90
+ },
91
+ {
92
+ "epoch": 0.2979011509817197,
93
+ "grad_norm": 8.368565559387207,
94
+ "learning_rate": 9.920556107249256e-06,
95
+ "loss": 0.9452,
96
+ "step": 220
97
+ },
98
+ {
99
+ "epoch": 0.32498307379823965,
100
+ "grad_norm": 7.2616682052612305,
101
+ "learning_rate": 9.821251241310825e-06,
102
+ "loss": 0.9298,
103
+ "step": 240
104
+ },
105
+ {
106
+ "epoch": 0.35206499661475965,
107
+ "grad_norm": 7.266394138336182,
108
+ "learning_rate": 9.721946375372395e-06,
109
+ "loss": 0.8567,
110
+ "step": 260
111
+ },
112
+ {
113
+ "epoch": 0.3791469194312796,
114
+ "grad_norm": 8.120067596435547,
115
+ "learning_rate": 9.622641509433963e-06,
116
+ "loss": 0.8251,
117
+ "step": 280
118
+ },
119
+ {
120
+ "epoch": 0.4062288422477996,
121
+ "grad_norm": 7.483547687530518,
122
+ "learning_rate": 9.523336643495532e-06,
123
+ "loss": 0.7774,
124
+ "step": 300
125
+ },
126
+ {
127
+ "epoch": 0.4333107650643196,
128
+ "grad_norm": 7.521896839141846,
129
+ "learning_rate": 9.4240317775571e-06,
130
+ "loss": 0.7266,
131
+ "step": 320
132
+ },
133
+ {
134
+ "epoch": 0.46039268788083954,
135
+ "grad_norm": 7.073266506195068,
136
+ "learning_rate": 9.32472691161867e-06,
137
+ "loss": 0.7248,
138
+ "step": 340
139
+ },
140
+ {
141
+ "epoch": 0.48747461069735953,
142
+ "grad_norm": 6.335423469543457,
143
+ "learning_rate": 9.22542204568024e-06,
144
+ "loss": 0.7151,
145
+ "step": 360
146
+ },
147
+ {
148
+ "epoch": 0.5145565335138795,
149
+ "grad_norm": 6.936922550201416,
150
+ "learning_rate": 9.126117179741808e-06,
151
+ "loss": 0.6881,
152
+ "step": 380
153
+ },
154
+ {
155
+ "epoch": 0.5416384563303994,
156
+ "grad_norm": 7.596807479858398,
157
+ "learning_rate": 9.026812313803377e-06,
158
+ "loss": 0.6995,
159
+ "step": 400
160
+ },
161
+ {
162
+ "epoch": 0.5416384563303994,
163
+ "eval_loss": 0.6352065801620483,
164
+ "eval_runtime": 1597.3635,
165
+ "eval_samples_per_second": 3.697,
166
+ "eval_steps_per_second": 0.463,
167
+ "eval_wer": 51.63653601672089,
168
+ "step": 400
169
+ },
170
+ {
171
+ "epoch": 0.5687203791469194,
172
+ "grad_norm": 6.981812477111816,
173
+ "learning_rate": 8.927507447864945e-06,
174
+ "loss": 0.6795,
175
+ "step": 420
176
+ },
177
+ {
178
+ "epoch": 0.5958023019634394,
179
+ "grad_norm": 6.481506824493408,
180
+ "learning_rate": 8.828202581926516e-06,
181
+ "loss": 0.6643,
182
+ "step": 440
183
+ },
184
+ {
185
+ "epoch": 0.6228842247799594,
186
+ "grad_norm": 6.537086009979248,
187
+ "learning_rate": 8.728897715988084e-06,
188
+ "loss": 0.6459,
189
+ "step": 460
190
+ },
191
+ {
192
+ "epoch": 0.6499661475964793,
193
+ "grad_norm": 6.739567756652832,
194
+ "learning_rate": 8.629592850049653e-06,
195
+ "loss": 0.6736,
196
+ "step": 480
197
+ },
198
+ {
199
+ "epoch": 0.6770480704129993,
200
+ "grad_norm": 7.422546863555908,
201
+ "learning_rate": 8.530287984111221e-06,
202
+ "loss": 0.6591,
203
+ "step": 500
204
+ },
205
+ {
206
+ "epoch": 0.7041299932295193,
207
+ "grad_norm": 5.7051215171813965,
208
+ "learning_rate": 8.430983118172792e-06,
209
+ "loss": 0.6442,
210
+ "step": 520
211
+ },
212
+ {
213
+ "epoch": 0.7312119160460393,
214
+ "grad_norm": 7.166143417358398,
215
+ "learning_rate": 8.33167825223436e-06,
216
+ "loss": 0.6326,
217
+ "step": 540
218
+ },
219
+ {
220
+ "epoch": 0.7582938388625592,
221
+ "grad_norm": 7.759460926055908,
222
+ "learning_rate": 8.232373386295929e-06,
223
+ "loss": 0.6328,
224
+ "step": 560
225
+ },
226
+ {
227
+ "epoch": 0.7853757616790792,
228
+ "grad_norm": 5.876537799835205,
229
+ "learning_rate": 8.133068520357497e-06,
230
+ "loss": 0.5945,
231
+ "step": 580
232
+ },
233
+ {
234
+ "epoch": 0.8124576844955992,
235
+ "grad_norm": 6.475106716156006,
236
+ "learning_rate": 8.033763654419066e-06,
237
+ "loss": 0.6038,
238
+ "step": 600
239
+ },
240
+ {
241
+ "epoch": 0.8124576844955992,
242
+ "eval_loss": 0.5318673849105835,
243
+ "eval_runtime": 1614.0202,
244
+ "eval_samples_per_second": 3.659,
245
+ "eval_steps_per_second": 0.458,
246
+ "eval_wer": 44.61419121291129,
247
+ "step": 600
248
+ },
249
+ {
250
+ "epoch": 0.8395396073121192,
251
+ "grad_norm": 6.8348259925842285,
252
+ "learning_rate": 7.934458788480636e-06,
253
+ "loss": 0.6046,
254
+ "step": 620
255
+ },
256
+ {
257
+ "epoch": 0.8666215301286392,
258
+ "grad_norm": 6.060784339904785,
259
+ "learning_rate": 7.835153922542206e-06,
260
+ "loss": 0.6055,
261
+ "step": 640
262
+ },
263
+ {
264
+ "epoch": 0.8937034529451591,
265
+ "grad_norm": 6.01262903213501,
266
+ "learning_rate": 7.735849056603775e-06,
267
+ "loss": 0.6139,
268
+ "step": 660
269
+ },
270
+ {
271
+ "epoch": 0.9207853757616791,
272
+ "grad_norm": 6.706854820251465,
273
+ "learning_rate": 7.636544190665344e-06,
274
+ "loss": 0.5789,
275
+ "step": 680
276
+ },
277
+ {
278
+ "epoch": 0.9478672985781991,
279
+ "grad_norm": 6.464681625366211,
280
+ "learning_rate": 7.537239324726913e-06,
281
+ "loss": 0.5857,
282
+ "step": 700
283
+ },
284
+ {
285
+ "epoch": 0.9749492213947191,
286
+ "grad_norm": 7.8035478591918945,
287
+ "learning_rate": 7.437934458788482e-06,
288
+ "loss": 0.5739,
289
+ "step": 720
290
+ },
291
+ {
292
+ "epoch": 1.001354096140826,
293
+ "grad_norm": 5.574532985687256,
294
+ "learning_rate": 7.33862959285005e-06,
295
+ "loss": 0.5443,
296
+ "step": 740
297
+ },
298
+ {
299
+ "epoch": 1.028436018957346,
300
+ "grad_norm": 5.275153160095215,
301
+ "learning_rate": 7.23932472691162e-06,
302
+ "loss": 0.4705,
303
+ "step": 760
304
+ },
305
+ {
306
+ "epoch": 1.055517941773866,
307
+ "grad_norm": 5.722527027130127,
308
+ "learning_rate": 7.140019860973188e-06,
309
+ "loss": 0.4692,
310
+ "step": 780
311
+ },
312
+ {
313
+ "epoch": 1.0825998645903858,
314
+ "grad_norm": 5.501762866973877,
315
+ "learning_rate": 7.040714995034758e-06,
316
+ "loss": 0.4791,
317
+ "step": 800
318
+ },
319
+ {
320
+ "epoch": 1.0825998645903858,
321
+ "eval_loss": 0.46895861625671387,
322
+ "eval_runtime": 1599.6376,
323
+ "eval_samples_per_second": 3.691,
324
+ "eval_steps_per_second": 0.462,
325
+ "eval_wer": 40.9726561658299,
326
+ "step": 800
327
+ },
328
+ {
329
+ "epoch": 1.1096817874069058,
330
+ "grad_norm": 5.207645893096924,
331
+ "learning_rate": 6.941410129096326e-06,
332
+ "loss": 0.4569,
333
+ "step": 820
334
+ },
335
+ {
336
+ "epoch": 1.1367637102234258,
337
+ "grad_norm": 5.584475517272949,
338
+ "learning_rate": 6.842105263157896e-06,
339
+ "loss": 0.4262,
340
+ "step": 840
341
+ },
342
+ {
343
+ "epoch": 1.1638456330399458,
344
+ "grad_norm": 5.881894588470459,
345
+ "learning_rate": 6.742800397219464e-06,
346
+ "loss": 0.4533,
347
+ "step": 860
348
+ },
349
+ {
350
+ "epoch": 1.1909275558564658,
351
+ "grad_norm": 6.939334869384766,
352
+ "learning_rate": 6.643495531281034e-06,
353
+ "loss": 0.4675,
354
+ "step": 880
355
+ },
356
+ {
357
+ "epoch": 1.2180094786729858,
358
+ "grad_norm": 5.780360698699951,
359
+ "learning_rate": 6.544190665342602e-06,
360
+ "loss": 0.4238,
361
+ "step": 900
362
+ },
363
+ {
364
+ "epoch": 1.2450914014895058,
365
+ "grad_norm": 5.867160797119141,
366
+ "learning_rate": 6.444885799404172e-06,
367
+ "loss": 0.444,
368
+ "step": 920
369
+ },
370
+ {
371
+ "epoch": 1.2721733243060258,
372
+ "grad_norm": 6.121824741363525,
373
+ "learning_rate": 6.34558093346574e-06,
374
+ "loss": 0.4333,
375
+ "step": 940
376
+ },
377
+ {
378
+ "epoch": 1.2992552471225456,
379
+ "grad_norm": 5.132157802581787,
380
+ "learning_rate": 6.24627606752731e-06,
381
+ "loss": 0.4544,
382
+ "step": 960
383
+ },
384
+ {
385
+ "epoch": 1.3263371699390656,
386
+ "grad_norm": 6.384315013885498,
387
+ "learning_rate": 6.146971201588878e-06,
388
+ "loss": 0.4332,
389
+ "step": 980
390
+ },
391
+ {
392
+ "epoch": 1.3534190927555856,
393
+ "grad_norm": 4.54695987701416,
394
+ "learning_rate": 6.047666335650447e-06,
395
+ "loss": 0.4416,
396
+ "step": 1000
397
+ },
398
+ {
399
+ "epoch": 1.3534190927555856,
400
+ "eval_loss": 0.43043428659439087,
401
+ "eval_runtime": 1614.3899,
402
+ "eval_samples_per_second": 3.658,
403
+ "eval_steps_per_second": 0.458,
404
+ "eval_wer": 38.00771402098731,
405
+ "step": 1000
406
+ },
407
+ {
408
+ "epoch": 1.3805010155721056,
409
+ "grad_norm": 4.973900318145752,
410
+ "learning_rate": 5.948361469712016e-06,
411
+ "loss": 0.4401,
412
+ "step": 1020
413
+ },
414
+ {
415
+ "epoch": 1.4075829383886256,
416
+ "grad_norm": 5.952788352966309,
417
+ "learning_rate": 5.849056603773585e-06,
418
+ "loss": 0.4865,
419
+ "step": 1040
420
+ },
421
+ {
422
+ "epoch": 1.4346648612051456,
423
+ "grad_norm": 6.601942539215088,
424
+ "learning_rate": 5.749751737835154e-06,
425
+ "loss": 0.4435,
426
+ "step": 1060
427
+ },
428
+ {
429
+ "epoch": 1.4617467840216656,
430
+ "grad_norm": 6.17143440246582,
431
+ "learning_rate": 5.650446871896723e-06,
432
+ "loss": 0.451,
433
+ "step": 1080
434
+ },
435
+ {
436
+ "epoch": 1.4888287068381856,
437
+ "grad_norm": 5.782886981964111,
438
+ "learning_rate": 5.551142005958292e-06,
439
+ "loss": 0.4311,
440
+ "step": 1100
441
+ },
442
+ {
443
+ "epoch": 1.5159106296547056,
444
+ "grad_norm": 5.734127998352051,
445
+ "learning_rate": 5.451837140019861e-06,
446
+ "loss": 0.4632,
447
+ "step": 1120
448
+ },
449
+ {
450
+ "epoch": 1.5429925524712256,
451
+ "grad_norm": 5.601761341094971,
452
+ "learning_rate": 5.35253227408143e-06,
453
+ "loss": 0.424,
454
+ "step": 1140
455
+ },
456
+ {
457
+ "epoch": 1.5700744752877456,
458
+ "grad_norm": 5.866110801696777,
459
+ "learning_rate": 5.253227408142999e-06,
460
+ "loss": 0.4273,
461
+ "step": 1160
462
+ },
463
+ {
464
+ "epoch": 1.5971563981042654,
465
+ "grad_norm": 5.120361328125,
466
+ "learning_rate": 5.153922542204568e-06,
467
+ "loss": 0.4297,
468
+ "step": 1180
469
+ },
470
+ {
471
+ "epoch": 1.6242383209207854,
472
+ "grad_norm": 5.093082427978516,
473
+ "learning_rate": 5.054617676266137e-06,
474
+ "loss": 0.4321,
475
+ "step": 1200
476
+ },
477
+ {
478
+ "epoch": 1.6242383209207854,
479
+ "eval_loss": 0.3993258476257324,
480
+ "eval_runtime": 1619.3754,
481
+ "eval_samples_per_second": 3.646,
482
+ "eval_steps_per_second": 0.456,
483
+ "eval_wer": 37.091943373052636,
484
+ "step": 1200
485
+ },
486
+ {
487
+ "epoch": 1.6513202437373053,
488
+ "grad_norm": 5.076949119567871,
489
+ "learning_rate": 4.955312810327706e-06,
490
+ "loss": 0.4446,
491
+ "step": 1220
492
+ },
493
+ {
494
+ "epoch": 1.6784021665538253,
495
+ "grad_norm": 5.729442596435547,
496
+ "learning_rate": 4.856007944389276e-06,
497
+ "loss": 0.4375,
498
+ "step": 1240
499
+ },
500
+ {
501
+ "epoch": 1.7054840893703453,
502
+ "grad_norm": 5.660022258758545,
503
+ "learning_rate": 4.756703078450844e-06,
504
+ "loss": 0.4318,
505
+ "step": 1260
506
+ },
507
+ {
508
+ "epoch": 1.7325660121868651,
509
+ "grad_norm": 5.320847511291504,
510
+ "learning_rate": 4.657398212512414e-06,
511
+ "loss": 0.4255,
512
+ "step": 1280
513
+ },
514
+ {
515
+ "epoch": 1.7596479350033851,
516
+ "grad_norm": 4.652419567108154,
517
+ "learning_rate": 4.558093346573982e-06,
518
+ "loss": 0.4231,
519
+ "step": 1300
520
+ },
521
+ {
522
+ "epoch": 1.7867298578199051,
523
+ "grad_norm": 6.301181793212891,
524
+ "learning_rate": 4.458788480635551e-06,
525
+ "loss": 0.4426,
526
+ "step": 1320
527
+ },
528
+ {
529
+ "epoch": 1.8138117806364251,
530
+ "grad_norm": 5.540495872497559,
531
+ "learning_rate": 4.35948361469712e-06,
532
+ "loss": 0.4361,
533
+ "step": 1340
534
+ },
535
+ {
536
+ "epoch": 1.8408937034529451,
537
+ "grad_norm": 5.618797302246094,
538
+ "learning_rate": 4.260178748758689e-06,
539
+ "loss": 0.4081,
540
+ "step": 1360
541
+ },
542
+ {
543
+ "epoch": 1.8679756262694651,
544
+ "grad_norm": 5.519278526306152,
545
+ "learning_rate": 4.160873882820258e-06,
546
+ "loss": 0.4311,
547
+ "step": 1380
548
+ },
549
+ {
550
+ "epoch": 1.8950575490859851,
551
+ "grad_norm": 5.24718713760376,
552
+ "learning_rate": 4.061569016881828e-06,
553
+ "loss": 0.441,
554
+ "step": 1400
555
+ },
556
+ {
557
+ "epoch": 1.8950575490859851,
558
+ "eval_loss": 0.37384942173957825,
559
+ "eval_runtime": 1616.0813,
560
+ "eval_samples_per_second": 3.654,
561
+ "eval_steps_per_second": 0.457,
562
+ "eval_wer": 35.20006895214291,
563
+ "step": 1400
564
+ },
565
+ {
566
+ "epoch": 1.9221394719025051,
567
+ "grad_norm": 5.9886884689331055,
568
+ "learning_rate": 3.962264150943396e-06,
569
+ "loss": 0.4125,
570
+ "step": 1420
571
+ },
572
+ {
573
+ "epoch": 1.9492213947190251,
574
+ "grad_norm": 5.233187198638916,
575
+ "learning_rate": 3.862959285004966e-06,
576
+ "loss": 0.4179,
577
+ "step": 1440
578
+ },
579
+ {
580
+ "epoch": 1.9763033175355451,
581
+ "grad_norm": 5.533965587615967,
582
+ "learning_rate": 3.763654419066535e-06,
583
+ "loss": 0.4272,
584
+ "step": 1460
585
+ },
586
+ {
587
+ "epoch": 2.002708192281652,
588
+ "grad_norm": 5.179864883422852,
589
+ "learning_rate": 3.664349553128104e-06,
590
+ "loss": 0.4082,
591
+ "step": 1480
592
+ },
593
+ {
594
+ "epoch": 2.029790115098172,
595
+ "grad_norm": 4.748936653137207,
596
+ "learning_rate": 3.565044687189673e-06,
597
+ "loss": 0.3085,
598
+ "step": 1500
599
+ },
600
+ {
601
+ "epoch": 2.056872037914692,
602
+ "grad_norm": 5.267838001251221,
603
+ "learning_rate": 3.4657398212512415e-06,
604
+ "loss": 0.312,
605
+ "step": 1520
606
+ },
607
+ {
608
+ "epoch": 2.083953960731212,
609
+ "grad_norm": 6.315608501434326,
610
+ "learning_rate": 3.3664349553128105e-06,
611
+ "loss": 0.3512,
612
+ "step": 1540
613
+ },
614
+ {
615
+ "epoch": 2.111035883547732,
616
+ "grad_norm": 5.232484340667725,
617
+ "learning_rate": 3.2671300893743795e-06,
618
+ "loss": 0.3184,
619
+ "step": 1560
620
+ },
621
+ {
622
+ "epoch": 2.138117806364252,
623
+ "grad_norm": 4.316751956939697,
624
+ "learning_rate": 3.1678252234359485e-06,
625
+ "loss": 0.3129,
626
+ "step": 1580
627
+ },
628
+ {
629
+ "epoch": 2.1651997291807716,
630
+ "grad_norm": 4.040640354156494,
631
+ "learning_rate": 3.0685203574975175e-06,
632
+ "loss": 0.3073,
633
+ "step": 1600
634
+ },
635
+ {
636
+ "epoch": 2.1651997291807716,
637
+ "eval_loss": 0.3516213595867157,
638
+ "eval_runtime": 1617.0984,
639
+ "eval_samples_per_second": 3.652,
640
+ "eval_steps_per_second": 0.457,
641
+ "eval_wer": 34.23258419703075,
642
+ "step": 1600
643
+ }
644
+ ],
645
+ "logging_steps": 20,
646
+ "max_steps": 2214,
647
+ "num_input_tokens_seen": 0,
648
+ "num_train_epochs": 3,
649
+ "save_steps": 200,
650
+ "stateful_callbacks": {
651
+ "TrainerControl": {
652
+ "args": {
653
+ "should_epoch_stop": false,
654
+ "should_evaluate": false,
655
+ "should_log": false,
656
+ "should_save": true,
657
+ "should_training_stop": false
658
+ },
659
+ "attributes": {}
660
+ }
661
+ },
662
+ "total_flos": 1.475941171396608e+19,
663
+ "train_batch_size": 16,
664
+ "trial_name": null,
665
+ "trial_params": null
666
+ }
checkpoint-1600/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d182356a12cee606170360f976946e1a56ce9d494fc1de269ffa184a80a19aa
3
+ size 5432
checkpoint-1800/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "gelu",
4
+ "apply_spec_augment": false,
5
+ "architectures": [
6
+ "WhisperForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "begin_suppress_tokens": null,
10
+ "bos_token_id": 50257,
11
+ "classifier_proj_size": 256,
12
+ "d_model": 768,
13
+ "decoder_attention_heads": 12,
14
+ "decoder_ffn_dim": 3072,
15
+ "decoder_layerdrop": 0.0,
16
+ "decoder_layers": 12,
17
+ "decoder_start_token_id": 50258,
18
+ "dropout": 0.0,
19
+ "encoder_attention_heads": 12,
20
+ "encoder_ffn_dim": 3072,
21
+ "encoder_layerdrop": 0.0,
22
+ "encoder_layers": 12,
23
+ "eos_token_id": 50257,
24
+ "forced_decoder_ids": null,
25
+ "init_std": 0.02,
26
+ "is_encoder_decoder": true,
27
+ "mask_feature_length": 10,
28
+ "mask_feature_min_masks": 0,
29
+ "mask_feature_prob": 0.0,
30
+ "mask_time_length": 10,
31
+ "mask_time_min_masks": 2,
32
+ "mask_time_prob": 0.05,
33
+ "max_length": null,
34
+ "max_source_positions": 1500,
35
+ "max_target_positions": 448,
36
+ "median_filter_width": 7,
37
+ "model_type": "whisper",
38
+ "num_hidden_layers": 12,
39
+ "num_mel_bins": 80,
40
+ "pad_token_id": 50257,
41
+ "scale_embedding": false,
42
+ "torch_dtype": "float32",
43
+ "transformers_version": "4.51.3",
44
+ "use_cache": false,
45
+ "use_weighted_layer_sum": false,
46
+ "vocab_size": 51865
47
+ }
checkpoint-1800/generation_config.json ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alignment_heads": [
3
+ [
4
+ 5,
5
+ 3
6
+ ],
7
+ [
8
+ 5,
9
+ 9
10
+ ],
11
+ [
12
+ 8,
13
+ 0
14
+ ],
15
+ [
16
+ 8,
17
+ 4
18
+ ],
19
+ [
20
+ 8,
21
+ 7
22
+ ],
23
+ [
24
+ 8,
25
+ 8
26
+ ],
27
+ [
28
+ 9,
29
+ 0
30
+ ],
31
+ [
32
+ 9,
33
+ 7
34
+ ],
35
+ [
36
+ 9,
37
+ 9
38
+ ],
39
+ [
40
+ 10,
41
+ 5
42
+ ]
43
+ ],
44
+ "begin_suppress_tokens": [
45
+ 220,
46
+ 50257
47
+ ],
48
+ "bos_token_id": 50257,
49
+ "decoder_start_token_id": 50258,
50
+ "eos_token_id": 50257,
51
+ "forced_decoder_ids": [
52
+ [
53
+ 1,
54
+ null
55
+ ],
56
+ [
57
+ 2,
58
+ 50359
59
+ ]
60
+ ],
61
+ "is_multilingual": true,
62
+ "lang_to_id": {
63
+ "<|af|>": 50327,
64
+ "<|am|>": 50334,
65
+ "<|ar|>": 50272,
66
+ "<|as|>": 50350,
67
+ "<|az|>": 50304,
68
+ "<|ba|>": 50355,
69
+ "<|be|>": 50330,
70
+ "<|bg|>": 50292,
71
+ "<|bn|>": 50302,
72
+ "<|bo|>": 50347,
73
+ "<|br|>": 50309,
74
+ "<|bs|>": 50315,
75
+ "<|ca|>": 50270,
76
+ "<|cs|>": 50283,
77
+ "<|cy|>": 50297,
78
+ "<|da|>": 50285,
79
+ "<|de|>": 50261,
80
+ "<|el|>": 50281,
81
+ "<|en|>": 50259,
82
+ "<|es|>": 50262,
83
+ "<|et|>": 50307,
84
+ "<|eu|>": 50310,
85
+ "<|fa|>": 50300,
86
+ "<|fi|>": 50277,
87
+ "<|fo|>": 50338,
88
+ "<|fr|>": 50265,
89
+ "<|gl|>": 50319,
90
+ "<|gu|>": 50333,
91
+ "<|haw|>": 50352,
92
+ "<|ha|>": 50354,
93
+ "<|he|>": 50279,
94
+ "<|hi|>": 50276,
95
+ "<|hr|>": 50291,
96
+ "<|ht|>": 50339,
97
+ "<|hu|>": 50286,
98
+ "<|hy|>": 50312,
99
+ "<|id|>": 50275,
100
+ "<|is|>": 50311,
101
+ "<|it|>": 50274,
102
+ "<|ja|>": 50266,
103
+ "<|jw|>": 50356,
104
+ "<|ka|>": 50329,
105
+ "<|kk|>": 50316,
106
+ "<|km|>": 50323,
107
+ "<|kn|>": 50306,
108
+ "<|ko|>": 50264,
109
+ "<|la|>": 50294,
110
+ "<|lb|>": 50345,
111
+ "<|ln|>": 50353,
112
+ "<|lo|>": 50336,
113
+ "<|lt|>": 50293,
114
+ "<|lv|>": 50301,
115
+ "<|mg|>": 50349,
116
+ "<|mi|>": 50295,
117
+ "<|mk|>": 50308,
118
+ "<|ml|>": 50296,
119
+ "<|mn|>": 50314,
120
+ "<|mr|>": 50320,
121
+ "<|ms|>": 50282,
122
+ "<|mt|>": 50343,
123
+ "<|my|>": 50346,
124
+ "<|ne|>": 50313,
125
+ "<|nl|>": 50271,
126
+ "<|nn|>": 50342,
127
+ "<|no|>": 50288,
128
+ "<|oc|>": 50328,
129
+ "<|pa|>": 50321,
130
+ "<|pl|>": 50269,
131
+ "<|ps|>": 50340,
132
+ "<|pt|>": 50267,
133
+ "<|ro|>": 50284,
134
+ "<|ru|>": 50263,
135
+ "<|sa|>": 50344,
136
+ "<|sd|>": 50332,
137
+ "<|si|>": 50322,
138
+ "<|sk|>": 50298,
139
+ "<|sl|>": 50305,
140
+ "<|sn|>": 50324,
141
+ "<|so|>": 50326,
142
+ "<|sq|>": 50317,
143
+ "<|sr|>": 50303,
144
+ "<|su|>": 50357,
145
+ "<|sv|>": 50273,
146
+ "<|sw|>": 50318,
147
+ "<|ta|>": 50287,
148
+ "<|te|>": 50299,
149
+ "<|tg|>": 50331,
150
+ "<|th|>": 50289,
151
+ "<|tk|>": 50341,
152
+ "<|tl|>": 50348,
153
+ "<|tr|>": 50268,
154
+ "<|tt|>": 50351,
155
+ "<|uk|>": 50280,
156
+ "<|ur|>": 50290,
157
+ "<|uz|>": 50337,
158
+ "<|vi|>": 50278,
159
+ "<|yi|>": 50335,
160
+ "<|yo|>": 50325,
161
+ "<|zh|>": 50260
162
+ },
163
+ "max_initial_timestamp_index": 50,
164
+ "max_length": 448,
165
+ "no_timestamps_token_id": 50363,
166
+ "pad_token_id": 50257,
167
+ "prev_sot_token_id": 50361,
168
+ "return_timestamps": false,
169
+ "suppress_tokens": [],
170
+ "task_to_id": {
171
+ "transcribe": 50359,
172
+ "translate": 50358
173
+ },
174
+ "transformers_version": "4.51.3"
175
+ }
checkpoint-1800/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7dc6dee26661153af82843dc98a2bf3c620d3eb96f9cccead9a0a30cc189e51e
3
+ size 966995080
checkpoint-1800/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:477587882c393bea2cc1aefb308921ce1a0f3bba2d063d3ef09781c4d2c9903d
3
+ size 1925064044
checkpoint-1800/preprocessor_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 30,
3
+ "dither": 0.0,
4
+ "feature_extractor_type": "WhisperFeatureExtractor",
5
+ "feature_size": 80,
6
+ "hop_length": 160,
7
+ "n_fft": 400,
8
+ "n_samples": 480000,
9
+ "nb_max_frames": 3000,
10
+ "padding_side": "right",
11
+ "padding_value": 0.0,
12
+ "processor_class": "WhisperProcessor",
13
+ "return_attention_mask": false,
14
+ "sampling_rate": 16000
15
+ }
checkpoint-1800/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c02f80a636544d120b715d6dca9e9c0e47f8b014a7a4861bdf4a0b8572e1441d
3
+ size 14244
checkpoint-1800/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83e85adb2cd8f94e8a73d4675c91bbf77e4ecf8433ddf2b612a7e29cbba96885
3
+ size 988
checkpoint-1800/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d69d282d7d5684966027cd3da548aacffa1698b5f2194c7ffab99a06da17983f
3
+ size 1064
checkpoint-1800/trainer_state.json ADDED
@@ -0,0 +1,745 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1800,
3
+ "best_metric": 33.028076450688445,
4
+ "best_model_checkpoint": "./HAUSA_B/checkpoint-1800",
5
+ "epoch": 2.4360189573459716,
6
+ "eval_steps": 200,
7
+ "global_step": 1800,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.027081922816519974,
14
+ "grad_norm": 106.59215545654297,
15
+ "learning_rate": 8.000000000000001e-07,
16
+ "loss": 6.1915,
17
+ "step": 20
18
+ },
19
+ {
20
+ "epoch": 0.05416384563303995,
21
+ "grad_norm": 30.563518524169922,
22
+ "learning_rate": 1.8000000000000001e-06,
23
+ "loss": 4.5886,
24
+ "step": 40
25
+ },
26
+ {
27
+ "epoch": 0.08124576844955991,
28
+ "grad_norm": 16.528457641601562,
29
+ "learning_rate": 2.8000000000000003e-06,
30
+ "loss": 3.0338,
31
+ "step": 60
32
+ },
33
+ {
34
+ "epoch": 0.1083276912660799,
35
+ "grad_norm": 13.852096557617188,
36
+ "learning_rate": 3.8000000000000005e-06,
37
+ "loss": 2.3188,
38
+ "step": 80
39
+ },
40
+ {
41
+ "epoch": 0.13540961408259986,
42
+ "grad_norm": 13.284646987915039,
43
+ "learning_rate": 4.800000000000001e-06,
44
+ "loss": 1.9725,
45
+ "step": 100
46
+ },
47
+ {
48
+ "epoch": 0.16249153689911983,
49
+ "grad_norm": 13.212055206298828,
50
+ "learning_rate": 5.8e-06,
51
+ "loss": 1.7839,
52
+ "step": 120
53
+ },
54
+ {
55
+ "epoch": 0.1895734597156398,
56
+ "grad_norm": 12.006990432739258,
57
+ "learning_rate": 6.800000000000001e-06,
58
+ "loss": 1.5991,
59
+ "step": 140
60
+ },
61
+ {
62
+ "epoch": 0.2166553825321598,
63
+ "grad_norm": 12.490514755249023,
64
+ "learning_rate": 7.800000000000002e-06,
65
+ "loss": 1.4411,
66
+ "step": 160
67
+ },
68
+ {
69
+ "epoch": 0.24373730534867977,
70
+ "grad_norm": 11.587646484375,
71
+ "learning_rate": 8.8e-06,
72
+ "loss": 1.3487,
73
+ "step": 180
74
+ },
75
+ {
76
+ "epoch": 0.2708192281651997,
77
+ "grad_norm": 11.574933052062988,
78
+ "learning_rate": 9.800000000000001e-06,
79
+ "loss": 1.2119,
80
+ "step": 200
81
+ },
82
+ {
83
+ "epoch": 0.2708192281651997,
84
+ "eval_loss": 1.1157827377319336,
85
+ "eval_runtime": 1601.2991,
86
+ "eval_samples_per_second": 3.688,
87
+ "eval_steps_per_second": 0.462,
88
+ "eval_wer": 64.22461160550755,
89
+ "step": 200
90
+ },
91
+ {
92
+ "epoch": 0.2979011509817197,
93
+ "grad_norm": 8.368565559387207,
94
+ "learning_rate": 9.920556107249256e-06,
95
+ "loss": 0.9452,
96
+ "step": 220
97
+ },
98
+ {
99
+ "epoch": 0.32498307379823965,
100
+ "grad_norm": 7.2616682052612305,
101
+ "learning_rate": 9.821251241310825e-06,
102
+ "loss": 0.9298,
103
+ "step": 240
104
+ },
105
+ {
106
+ "epoch": 0.35206499661475965,
107
+ "grad_norm": 7.266394138336182,
108
+ "learning_rate": 9.721946375372395e-06,
109
+ "loss": 0.8567,
110
+ "step": 260
111
+ },
112
+ {
113
+ "epoch": 0.3791469194312796,
114
+ "grad_norm": 8.120067596435547,
115
+ "learning_rate": 9.622641509433963e-06,
116
+ "loss": 0.8251,
117
+ "step": 280
118
+ },
119
+ {
120
+ "epoch": 0.4062288422477996,
121
+ "grad_norm": 7.483547687530518,
122
+ "learning_rate": 9.523336643495532e-06,
123
+ "loss": 0.7774,
124
+ "step": 300
125
+ },
126
+ {
127
+ "epoch": 0.4333107650643196,
128
+ "grad_norm": 7.521896839141846,
129
+ "learning_rate": 9.4240317775571e-06,
130
+ "loss": 0.7266,
131
+ "step": 320
132
+ },
133
+ {
134
+ "epoch": 0.46039268788083954,
135
+ "grad_norm": 7.073266506195068,
136
+ "learning_rate": 9.32472691161867e-06,
137
+ "loss": 0.7248,
138
+ "step": 340
139
+ },
140
+ {
141
+ "epoch": 0.48747461069735953,
142
+ "grad_norm": 6.335423469543457,
143
+ "learning_rate": 9.22542204568024e-06,
144
+ "loss": 0.7151,
145
+ "step": 360
146
+ },
147
+ {
148
+ "epoch": 0.5145565335138795,
149
+ "grad_norm": 6.936922550201416,
150
+ "learning_rate": 9.126117179741808e-06,
151
+ "loss": 0.6881,
152
+ "step": 380
153
+ },
154
+ {
155
+ "epoch": 0.5416384563303994,
156
+ "grad_norm": 7.596807479858398,
157
+ "learning_rate": 9.026812313803377e-06,
158
+ "loss": 0.6995,
159
+ "step": 400
160
+ },
161
+ {
162
+ "epoch": 0.5416384563303994,
163
+ "eval_loss": 0.6352065801620483,
164
+ "eval_runtime": 1597.3635,
165
+ "eval_samples_per_second": 3.697,
166
+ "eval_steps_per_second": 0.463,
167
+ "eval_wer": 51.63653601672089,
168
+ "step": 400
169
+ },
170
+ {
171
+ "epoch": 0.5687203791469194,
172
+ "grad_norm": 6.981812477111816,
173
+ "learning_rate": 8.927507447864945e-06,
174
+ "loss": 0.6795,
175
+ "step": 420
176
+ },
177
+ {
178
+ "epoch": 0.5958023019634394,
179
+ "grad_norm": 6.481506824493408,
180
+ "learning_rate": 8.828202581926516e-06,
181
+ "loss": 0.6643,
182
+ "step": 440
183
+ },
184
+ {
185
+ "epoch": 0.6228842247799594,
186
+ "grad_norm": 6.537086009979248,
187
+ "learning_rate": 8.728897715988084e-06,
188
+ "loss": 0.6459,
189
+ "step": 460
190
+ },
191
+ {
192
+ "epoch": 0.6499661475964793,
193
+ "grad_norm": 6.739567756652832,
194
+ "learning_rate": 8.629592850049653e-06,
195
+ "loss": 0.6736,
196
+ "step": 480
197
+ },
198
+ {
199
+ "epoch": 0.6770480704129993,
200
+ "grad_norm": 7.422546863555908,
201
+ "learning_rate": 8.530287984111221e-06,
202
+ "loss": 0.6591,
203
+ "step": 500
204
+ },
205
+ {
206
+ "epoch": 0.7041299932295193,
207
+ "grad_norm": 5.7051215171813965,
208
+ "learning_rate": 8.430983118172792e-06,
209
+ "loss": 0.6442,
210
+ "step": 520
211
+ },
212
+ {
213
+ "epoch": 0.7312119160460393,
214
+ "grad_norm": 7.166143417358398,
215
+ "learning_rate": 8.33167825223436e-06,
216
+ "loss": 0.6326,
217
+ "step": 540
218
+ },
219
+ {
220
+ "epoch": 0.7582938388625592,
221
+ "grad_norm": 7.759460926055908,
222
+ "learning_rate": 8.232373386295929e-06,
223
+ "loss": 0.6328,
224
+ "step": 560
225
+ },
226
+ {
227
+ "epoch": 0.7853757616790792,
228
+ "grad_norm": 5.876537799835205,
229
+ "learning_rate": 8.133068520357497e-06,
230
+ "loss": 0.5945,
231
+ "step": 580
232
+ },
233
+ {
234
+ "epoch": 0.8124576844955992,
235
+ "grad_norm": 6.475106716156006,
236
+ "learning_rate": 8.033763654419066e-06,
237
+ "loss": 0.6038,
238
+ "step": 600
239
+ },
240
+ {
241
+ "epoch": 0.8124576844955992,
242
+ "eval_loss": 0.5318673849105835,
243
+ "eval_runtime": 1614.0202,
244
+ "eval_samples_per_second": 3.659,
245
+ "eval_steps_per_second": 0.458,
246
+ "eval_wer": 44.61419121291129,
247
+ "step": 600
248
+ },
249
+ {
250
+ "epoch": 0.8395396073121192,
251
+ "grad_norm": 6.8348259925842285,
252
+ "learning_rate": 7.934458788480636e-06,
253
+ "loss": 0.6046,
254
+ "step": 620
255
+ },
256
+ {
257
+ "epoch": 0.8666215301286392,
258
+ "grad_norm": 6.060784339904785,
259
+ "learning_rate": 7.835153922542206e-06,
260
+ "loss": 0.6055,
261
+ "step": 640
262
+ },
263
+ {
264
+ "epoch": 0.8937034529451591,
265
+ "grad_norm": 6.01262903213501,
266
+ "learning_rate": 7.735849056603775e-06,
267
+ "loss": 0.6139,
268
+ "step": 660
269
+ },
270
+ {
271
+ "epoch": 0.9207853757616791,
272
+ "grad_norm": 6.706854820251465,
273
+ "learning_rate": 7.636544190665344e-06,
274
+ "loss": 0.5789,
275
+ "step": 680
276
+ },
277
+ {
278
+ "epoch": 0.9478672985781991,
279
+ "grad_norm": 6.464681625366211,
280
+ "learning_rate": 7.537239324726913e-06,
281
+ "loss": 0.5857,
282
+ "step": 700
283
+ },
284
+ {
285
+ "epoch": 0.9749492213947191,
286
+ "grad_norm": 7.8035478591918945,
287
+ "learning_rate": 7.437934458788482e-06,
288
+ "loss": 0.5739,
289
+ "step": 720
290
+ },
291
+ {
292
+ "epoch": 1.001354096140826,
293
+ "grad_norm": 5.574532985687256,
294
+ "learning_rate": 7.33862959285005e-06,
295
+ "loss": 0.5443,
296
+ "step": 740
297
+ },
298
+ {
299
+ "epoch": 1.028436018957346,
300
+ "grad_norm": 5.275153160095215,
301
+ "learning_rate": 7.23932472691162e-06,
302
+ "loss": 0.4705,
303
+ "step": 760
304
+ },
305
+ {
306
+ "epoch": 1.055517941773866,
307
+ "grad_norm": 5.722527027130127,
308
+ "learning_rate": 7.140019860973188e-06,
309
+ "loss": 0.4692,
310
+ "step": 780
311
+ },
312
+ {
313
+ "epoch": 1.0825998645903858,
314
+ "grad_norm": 5.501762866973877,
315
+ "learning_rate": 7.040714995034758e-06,
316
+ "loss": 0.4791,
317
+ "step": 800
318
+ },
319
+ {
320
+ "epoch": 1.0825998645903858,
321
+ "eval_loss": 0.46895861625671387,
322
+ "eval_runtime": 1599.6376,
323
+ "eval_samples_per_second": 3.691,
324
+ "eval_steps_per_second": 0.462,
325
+ "eval_wer": 40.9726561658299,
326
+ "step": 800
327
+ },
328
+ {
329
+ "epoch": 1.1096817874069058,
330
+ "grad_norm": 5.207645893096924,
331
+ "learning_rate": 6.941410129096326e-06,
332
+ "loss": 0.4569,
333
+ "step": 820
334
+ },
335
+ {
336
+ "epoch": 1.1367637102234258,
337
+ "grad_norm": 5.584475517272949,
338
+ "learning_rate": 6.842105263157896e-06,
339
+ "loss": 0.4262,
340
+ "step": 840
341
+ },
342
+ {
343
+ "epoch": 1.1638456330399458,
344
+ "grad_norm": 5.881894588470459,
345
+ "learning_rate": 6.742800397219464e-06,
346
+ "loss": 0.4533,
347
+ "step": 860
348
+ },
349
+ {
350
+ "epoch": 1.1909275558564658,
351
+ "grad_norm": 6.939334869384766,
352
+ "learning_rate": 6.643495531281034e-06,
353
+ "loss": 0.4675,
354
+ "step": 880
355
+ },
356
+ {
357
+ "epoch": 1.2180094786729858,
358
+ "grad_norm": 5.780360698699951,
359
+ "learning_rate": 6.544190665342602e-06,
360
+ "loss": 0.4238,
361
+ "step": 900
362
+ },
363
+ {
364
+ "epoch": 1.2450914014895058,
365
+ "grad_norm": 5.867160797119141,
366
+ "learning_rate": 6.444885799404172e-06,
367
+ "loss": 0.444,
368
+ "step": 920
369
+ },
370
+ {
371
+ "epoch": 1.2721733243060258,
372
+ "grad_norm": 6.121824741363525,
373
+ "learning_rate": 6.34558093346574e-06,
374
+ "loss": 0.4333,
375
+ "step": 940
376
+ },
377
+ {
378
+ "epoch": 1.2992552471225456,
379
+ "grad_norm": 5.132157802581787,
380
+ "learning_rate": 6.24627606752731e-06,
381
+ "loss": 0.4544,
382
+ "step": 960
383
+ },
384
+ {
385
+ "epoch": 1.3263371699390656,
386
+ "grad_norm": 6.384315013885498,
387
+ "learning_rate": 6.146971201588878e-06,
388
+ "loss": 0.4332,
389
+ "step": 980
390
+ },
391
+ {
392
+ "epoch": 1.3534190927555856,
393
+ "grad_norm": 4.54695987701416,
394
+ "learning_rate": 6.047666335650447e-06,
395
+ "loss": 0.4416,
396
+ "step": 1000
397
+ },
398
+ {
399
+ "epoch": 1.3534190927555856,
400
+ "eval_loss": 0.43043428659439087,
401
+ "eval_runtime": 1614.3899,
402
+ "eval_samples_per_second": 3.658,
403
+ "eval_steps_per_second": 0.458,
404
+ "eval_wer": 38.00771402098731,
405
+ "step": 1000
406
+ },
407
+ {
408
+ "epoch": 1.3805010155721056,
409
+ "grad_norm": 4.973900318145752,
410
+ "learning_rate": 5.948361469712016e-06,
411
+ "loss": 0.4401,
412
+ "step": 1020
413
+ },
414
+ {
415
+ "epoch": 1.4075829383886256,
416
+ "grad_norm": 5.952788352966309,
417
+ "learning_rate": 5.849056603773585e-06,
418
+ "loss": 0.4865,
419
+ "step": 1040
420
+ },
421
+ {
422
+ "epoch": 1.4346648612051456,
423
+ "grad_norm": 6.601942539215088,
424
+ "learning_rate": 5.749751737835154e-06,
425
+ "loss": 0.4435,
426
+ "step": 1060
427
+ },
428
+ {
429
+ "epoch": 1.4617467840216656,
430
+ "grad_norm": 6.17143440246582,
431
+ "learning_rate": 5.650446871896723e-06,
432
+ "loss": 0.451,
433
+ "step": 1080
434
+ },
435
+ {
436
+ "epoch": 1.4888287068381856,
437
+ "grad_norm": 5.782886981964111,
438
+ "learning_rate": 5.551142005958292e-06,
439
+ "loss": 0.4311,
440
+ "step": 1100
441
+ },
442
+ {
443
+ "epoch": 1.5159106296547056,
444
+ "grad_norm": 5.734127998352051,
445
+ "learning_rate": 5.451837140019861e-06,
446
+ "loss": 0.4632,
447
+ "step": 1120
448
+ },
449
+ {
450
+ "epoch": 1.5429925524712256,
451
+ "grad_norm": 5.601761341094971,
452
+ "learning_rate": 5.35253227408143e-06,
453
+ "loss": 0.424,
454
+ "step": 1140
455
+ },
456
+ {
457
+ "epoch": 1.5700744752877456,
458
+ "grad_norm": 5.866110801696777,
459
+ "learning_rate": 5.253227408142999e-06,
460
+ "loss": 0.4273,
461
+ "step": 1160
462
+ },
463
+ {
464
+ "epoch": 1.5971563981042654,
465
+ "grad_norm": 5.120361328125,
466
+ "learning_rate": 5.153922542204568e-06,
467
+ "loss": 0.4297,
468
+ "step": 1180
469
+ },
470
+ {
471
+ "epoch": 1.6242383209207854,
472
+ "grad_norm": 5.093082427978516,
473
+ "learning_rate": 5.054617676266137e-06,
474
+ "loss": 0.4321,
475
+ "step": 1200
476
+ },
477
+ {
478
+ "epoch": 1.6242383209207854,
479
+ "eval_loss": 0.3993258476257324,
480
+ "eval_runtime": 1619.3754,
481
+ "eval_samples_per_second": 3.646,
482
+ "eval_steps_per_second": 0.456,
483
+ "eval_wer": 37.091943373052636,
484
+ "step": 1200
485
+ },
486
+ {
487
+ "epoch": 1.6513202437373053,
488
+ "grad_norm": 5.076949119567871,
489
+ "learning_rate": 4.955312810327706e-06,
490
+ "loss": 0.4446,
491
+ "step": 1220
492
+ },
493
+ {
494
+ "epoch": 1.6784021665538253,
495
+ "grad_norm": 5.729442596435547,
496
+ "learning_rate": 4.856007944389276e-06,
497
+ "loss": 0.4375,
498
+ "step": 1240
499
+ },
500
+ {
501
+ "epoch": 1.7054840893703453,
502
+ "grad_norm": 5.660022258758545,
503
+ "learning_rate": 4.756703078450844e-06,
504
+ "loss": 0.4318,
505
+ "step": 1260
506
+ },
507
+ {
508
+ "epoch": 1.7325660121868651,
509
+ "grad_norm": 5.320847511291504,
510
+ "learning_rate": 4.657398212512414e-06,
511
+ "loss": 0.4255,
512
+ "step": 1280
513
+ },
514
+ {
515
+ "epoch": 1.7596479350033851,
516
+ "grad_norm": 4.652419567108154,
517
+ "learning_rate": 4.558093346573982e-06,
518
+ "loss": 0.4231,
519
+ "step": 1300
520
+ },
521
+ {
522
+ "epoch": 1.7867298578199051,
523
+ "grad_norm": 6.301181793212891,
524
+ "learning_rate": 4.458788480635551e-06,
525
+ "loss": 0.4426,
526
+ "step": 1320
527
+ },
528
+ {
529
+ "epoch": 1.8138117806364251,
530
+ "grad_norm": 5.540495872497559,
531
+ "learning_rate": 4.35948361469712e-06,
532
+ "loss": 0.4361,
533
+ "step": 1340
534
+ },
535
+ {
536
+ "epoch": 1.8408937034529451,
537
+ "grad_norm": 5.618797302246094,
538
+ "learning_rate": 4.260178748758689e-06,
539
+ "loss": 0.4081,
540
+ "step": 1360
541
+ },
542
+ {
543
+ "epoch": 1.8679756262694651,
544
+ "grad_norm": 5.519278526306152,
545
+ "learning_rate": 4.160873882820258e-06,
546
+ "loss": 0.4311,
547
+ "step": 1380
548
+ },
549
+ {
550
+ "epoch": 1.8950575490859851,
551
+ "grad_norm": 5.24718713760376,
552
+ "learning_rate": 4.061569016881828e-06,
553
+ "loss": 0.441,
554
+ "step": 1400
555
+ },
556
+ {
557
+ "epoch": 1.8950575490859851,
558
+ "eval_loss": 0.37384942173957825,
559
+ "eval_runtime": 1616.0813,
560
+ "eval_samples_per_second": 3.654,
561
+ "eval_steps_per_second": 0.457,
562
+ "eval_wer": 35.20006895214291,
563
+ "step": 1400
564
+ },
565
+ {
566
+ "epoch": 1.9221394719025051,
567
+ "grad_norm": 5.9886884689331055,
568
+ "learning_rate": 3.962264150943396e-06,
569
+ "loss": 0.4125,
570
+ "step": 1420
571
+ },
572
+ {
573
+ "epoch": 1.9492213947190251,
574
+ "grad_norm": 5.233187198638916,
575
+ "learning_rate": 3.862959285004966e-06,
576
+ "loss": 0.4179,
577
+ "step": 1440
578
+ },
579
+ {
580
+ "epoch": 1.9763033175355451,
581
+ "grad_norm": 5.533965587615967,
582
+ "learning_rate": 3.763654419066535e-06,
583
+ "loss": 0.4272,
584
+ "step": 1460
585
+ },
586
+ {
587
+ "epoch": 2.002708192281652,
588
+ "grad_norm": 5.179864883422852,
589
+ "learning_rate": 3.664349553128104e-06,
590
+ "loss": 0.4082,
591
+ "step": 1480
592
+ },
593
+ {
594
+ "epoch": 2.029790115098172,
595
+ "grad_norm": 4.748936653137207,
596
+ "learning_rate": 3.565044687189673e-06,
597
+ "loss": 0.3085,
598
+ "step": 1500
599
+ },
600
+ {
601
+ "epoch": 2.056872037914692,
602
+ "grad_norm": 5.267838001251221,
603
+ "learning_rate": 3.4657398212512415e-06,
604
+ "loss": 0.312,
605
+ "step": 1520
606
+ },
607
+ {
608
+ "epoch": 2.083953960731212,
609
+ "grad_norm": 6.315608501434326,
610
+ "learning_rate": 3.3664349553128105e-06,
611
+ "loss": 0.3512,
612
+ "step": 1540
613
+ },
614
+ {
615
+ "epoch": 2.111035883547732,
616
+ "grad_norm": 5.232484340667725,
617
+ "learning_rate": 3.2671300893743795e-06,
618
+ "loss": 0.3184,
619
+ "step": 1560
620
+ },
621
+ {
622
+ "epoch": 2.138117806364252,
623
+ "grad_norm": 4.316751956939697,
624
+ "learning_rate": 3.1678252234359485e-06,
625
+ "loss": 0.3129,
626
+ "step": 1580
627
+ },
628
+ {
629
+ "epoch": 2.1651997291807716,
630
+ "grad_norm": 4.040640354156494,
631
+ "learning_rate": 3.0685203574975175e-06,
632
+ "loss": 0.3073,
633
+ "step": 1600
634
+ },
635
+ {
636
+ "epoch": 2.1651997291807716,
637
+ "eval_loss": 0.3516213595867157,
638
+ "eval_runtime": 1617.0984,
639
+ "eval_samples_per_second": 3.652,
640
+ "eval_steps_per_second": 0.457,
641
+ "eval_wer": 34.23258419703075,
642
+ "step": 1600
643
+ },
644
+ {
645
+ "epoch": 2.1922816519972916,
646
+ "grad_norm": 4.437410831451416,
647
+ "learning_rate": 2.9692154915590865e-06,
648
+ "loss": 0.3293,
649
+ "step": 1620
650
+ },
651
+ {
652
+ "epoch": 2.2193635748138116,
653
+ "grad_norm": 4.284883499145508,
654
+ "learning_rate": 2.8699106256206555e-06,
655
+ "loss": 0.3237,
656
+ "step": 1640
657
+ },
658
+ {
659
+ "epoch": 2.2464454976303316,
660
+ "grad_norm": 4.959640979766846,
661
+ "learning_rate": 2.7706057596822245e-06,
662
+ "loss": 0.3374,
663
+ "step": 1660
664
+ },
665
+ {
666
+ "epoch": 2.2735274204468516,
667
+ "grad_norm": 5.366809368133545,
668
+ "learning_rate": 2.6713008937437935e-06,
669
+ "loss": 0.3086,
670
+ "step": 1680
671
+ },
672
+ {
673
+ "epoch": 2.3006093432633716,
674
+ "grad_norm": 4.363138198852539,
675
+ "learning_rate": 2.571996027805363e-06,
676
+ "loss": 0.331,
677
+ "step": 1700
678
+ },
679
+ {
680
+ "epoch": 2.3276912660798916,
681
+ "grad_norm": 4.7536468505859375,
682
+ "learning_rate": 2.4726911618669315e-06,
683
+ "loss": 0.317,
684
+ "step": 1720
685
+ },
686
+ {
687
+ "epoch": 2.3547731888964116,
688
+ "grad_norm": 4.352657794952393,
689
+ "learning_rate": 2.3733862959285005e-06,
690
+ "loss": 0.3008,
691
+ "step": 1740
692
+ },
693
+ {
694
+ "epoch": 2.3818551117129316,
695
+ "grad_norm": 4.978397846221924,
696
+ "learning_rate": 2.2740814299900695e-06,
697
+ "loss": 0.3245,
698
+ "step": 1760
699
+ },
700
+ {
701
+ "epoch": 2.4089370345294516,
702
+ "grad_norm": 4.570504665374756,
703
+ "learning_rate": 2.174776564051639e-06,
704
+ "loss": 0.3147,
705
+ "step": 1780
706
+ },
707
+ {
708
+ "epoch": 2.4360189573459716,
709
+ "grad_norm": 5.904801845550537,
710
+ "learning_rate": 2.075471698113208e-06,
711
+ "loss": 0.3395,
712
+ "step": 1800
713
+ },
714
+ {
715
+ "epoch": 2.4360189573459716,
716
+ "eval_loss": 0.3366335332393646,
717
+ "eval_runtime": 1614.6836,
718
+ "eval_samples_per_second": 3.657,
719
+ "eval_steps_per_second": 0.458,
720
+ "eval_wer": 33.028076450688445,
721
+ "step": 1800
722
+ }
723
+ ],
724
+ "logging_steps": 20,
725
+ "max_steps": 2214,
726
+ "num_input_tokens_seen": 0,
727
+ "num_train_epochs": 3,
728
+ "save_steps": 200,
729
+ "stateful_callbacks": {
730
+ "TrainerControl": {
731
+ "args": {
732
+ "should_epoch_stop": false,
733
+ "should_evaluate": false,
734
+ "should_log": false,
735
+ "should_save": true,
736
+ "should_training_stop": false
737
+ },
738
+ "attributes": {}
739
+ }
740
+ },
741
+ "total_flos": 1.660635827601408e+19,
742
+ "train_batch_size": 16,
743
+ "trial_name": null,
744
+ "trial_params": null
745
+ }
checkpoint-1800/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d182356a12cee606170360f976946e1a56ce9d494fc1de269ffa184a80a19aa
3
+ size 5432