Upload WhisperForConditionalGeneration

#2
by arijitx - opened
Files changed (4) hide show
  1. README.md +6 -11
  2. config.json +5 -9
  3. generation_config.json +175 -0
  4. model.safetensors +3 -0
README.md CHANGED
@@ -1,21 +1,18 @@
1
  ---
2
- language:
3
  - bn
4
  tags:
5
  - audio
6
  - automatic-speech-recognition
7
  - hf-asr-leaderboard
8
- # widget:
9
- # - example_title: Librispeech sample 1
10
- # src: https://cdn-media.huggingface.co/speech_samples/sample1.flac
11
- # - example_title: Librispeech sample 2
12
- # src: https://cdn-media.huggingface.co/speech_samples/sample2.flac
13
  model-index:
14
  - name: whisper-small-bn
15
  results:
16
  - task:
17
- name: Automatic Speech Recognition
18
  type: automatic-speech-recognition
 
19
  dataset:
20
  name: Common Voice 11.0
21
  type: mozilla-foundation/common_voice_11_0
@@ -24,11 +21,9 @@ model-index:
24
  args:
25
  language: bn
26
  metrics:
27
- - name: Test WER
28
- type: wer
29
  value: 35.14
30
- pipeline_tag: automatic-speech-recognition
31
- license: apache-2.0
32
  ---
33
 
34
  # Whisper
 
1
  ---
2
+ language:
3
  - bn
4
  tags:
5
  - audio
6
  - automatic-speech-recognition
7
  - hf-asr-leaderboard
8
+ pipeline_tag: automatic-speech-recognition
9
+ license: apache-2.0
 
 
 
10
  model-index:
11
  - name: whisper-small-bn
12
  results:
13
  - task:
 
14
  type: automatic-speech-recognition
15
+ name: Automatic Speech Recognition
16
  dataset:
17
  name: Common Voice 11.0
18
  type: mozilla-foundation/common_voice_11_0
 
21
  args:
22
  language: bn
23
  metrics:
24
+ - type: wer
 
25
  value: 35.14
26
+ name: Test WER
 
27
  ---
28
 
29
  # Whisper
config.json CHANGED
@@ -1,5 +1,4 @@
1
  {
2
- "_name_or_path": "openai/whisper-small",
3
  "activation_dropout": 0.0,
4
  "activation_function": "gelu",
5
  "apply_spec_augment": false,
@@ -7,10 +6,7 @@
7
  "WhisperForConditionalGeneration"
8
  ],
9
  "attention_dropout": 0.0,
10
- "begin_suppress_tokens": [
11
- 220,
12
- 50257
13
- ],
14
  "bos_token_id": 50257,
15
  "classifier_proj_size": 256,
16
  "d_model": 768,
@@ -20,6 +16,7 @@
20
  "decoder_layers": 12,
21
  "decoder_start_token_id": 50258,
22
  "dropout": 0.0,
 
23
  "encoder_attention_heads": 12,
24
  "encoder_ffn_dim": 3072,
25
  "encoder_layerdrop": 0.0,
@@ -34,17 +31,16 @@
34
  "mask_time_length": 10,
35
  "mask_time_min_masks": 2,
36
  "mask_time_prob": 0.05,
37
- "max_length": 448,
38
  "max_source_positions": 1500,
39
  "max_target_positions": 448,
 
40
  "model_type": "whisper",
41
  "num_hidden_layers": 12,
42
  "num_mel_bins": 80,
43
  "pad_token_id": 50257,
44
  "scale_embedding": false,
45
- "suppress_tokens": [],
46
- "torch_dtype": "float32",
47
- "transformers_version": "4.28.0.dev0",
48
  "use_cache": true,
49
  "use_weighted_layer_sum": false,
50
  "vocab_size": 51865
 
1
  {
 
2
  "activation_dropout": 0.0,
3
  "activation_function": "gelu",
4
  "apply_spec_augment": false,
 
6
  "WhisperForConditionalGeneration"
7
  ],
8
  "attention_dropout": 0.0,
9
+ "begin_suppress_tokens": null,
 
 
 
10
  "bos_token_id": 50257,
11
  "classifier_proj_size": 256,
12
  "d_model": 768,
 
16
  "decoder_layers": 12,
17
  "decoder_start_token_id": 50258,
18
  "dropout": 0.0,
19
+ "dtype": "float32",
20
  "encoder_attention_heads": 12,
21
  "encoder_ffn_dim": 3072,
22
  "encoder_layerdrop": 0.0,
 
31
  "mask_time_length": 10,
32
  "mask_time_min_masks": 2,
33
  "mask_time_prob": 0.05,
34
+ "max_length": null,
35
  "max_source_positions": 1500,
36
  "max_target_positions": 448,
37
+ "median_filter_width": 7,
38
  "model_type": "whisper",
39
  "num_hidden_layers": 12,
40
  "num_mel_bins": 80,
41
  "pad_token_id": 50257,
42
  "scale_embedding": false,
43
+ "transformers_version": "4.56.1",
 
 
44
  "use_cache": true,
45
  "use_weighted_layer_sum": false,
46
  "vocab_size": 51865
generation_config.json ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alignment_heads": [
3
+ [
4
+ 5,
5
+ 3
6
+ ],
7
+ [
8
+ 5,
9
+ 9
10
+ ],
11
+ [
12
+ 8,
13
+ 0
14
+ ],
15
+ [
16
+ 8,
17
+ 4
18
+ ],
19
+ [
20
+ 8,
21
+ 7
22
+ ],
23
+ [
24
+ 8,
25
+ 8
26
+ ],
27
+ [
28
+ 9,
29
+ 0
30
+ ],
31
+ [
32
+ 9,
33
+ 7
34
+ ],
35
+ [
36
+ 9,
37
+ 9
38
+ ],
39
+ [
40
+ 10,
41
+ 5
42
+ ]
43
+ ],
44
+ "begin_suppress_tokens": [
45
+ 220,
46
+ 50257
47
+ ],
48
+ "bos_token_id": 50257,
49
+ "decoder_start_token_id": 50258,
50
+ "eos_token_id": 50257,
51
+ "forced_decoder_ids": [
52
+ [
53
+ 1,
54
+ null
55
+ ],
56
+ [
57
+ 2,
58
+ 50359
59
+ ]
60
+ ],
61
+ "is_multilingual": true,
62
+ "lang_to_id": {
63
+ "<|af|>": 50327,
64
+ "<|am|>": 50334,
65
+ "<|ar|>": 50272,
66
+ "<|as|>": 50350,
67
+ "<|az|>": 50304,
68
+ "<|ba|>": 50355,
69
+ "<|be|>": 50330,
70
+ "<|bg|>": 50292,
71
+ "<|bn|>": 50302,
72
+ "<|bo|>": 50347,
73
+ "<|br|>": 50309,
74
+ "<|bs|>": 50315,
75
+ "<|ca|>": 50270,
76
+ "<|cs|>": 50283,
77
+ "<|cy|>": 50297,
78
+ "<|da|>": 50285,
79
+ "<|de|>": 50261,
80
+ "<|el|>": 50281,
81
+ "<|en|>": 50259,
82
+ "<|es|>": 50262,
83
+ "<|et|>": 50307,
84
+ "<|eu|>": 50310,
85
+ "<|fa|>": 50300,
86
+ "<|fi|>": 50277,
87
+ "<|fo|>": 50338,
88
+ "<|fr|>": 50265,
89
+ "<|gl|>": 50319,
90
+ "<|gu|>": 50333,
91
+ "<|haw|>": 50352,
92
+ "<|ha|>": 50354,
93
+ "<|he|>": 50279,
94
+ "<|hi|>": 50276,
95
+ "<|hr|>": 50291,
96
+ "<|ht|>": 50339,
97
+ "<|hu|>": 50286,
98
+ "<|hy|>": 50312,
99
+ "<|id|>": 50275,
100
+ "<|is|>": 50311,
101
+ "<|it|>": 50274,
102
+ "<|ja|>": 50266,
103
+ "<|jw|>": 50356,
104
+ "<|ka|>": 50329,
105
+ "<|kk|>": 50316,
106
+ "<|km|>": 50323,
107
+ "<|kn|>": 50306,
108
+ "<|ko|>": 50264,
109
+ "<|la|>": 50294,
110
+ "<|lb|>": 50345,
111
+ "<|ln|>": 50353,
112
+ "<|lo|>": 50336,
113
+ "<|lt|>": 50293,
114
+ "<|lv|>": 50301,
115
+ "<|mg|>": 50349,
116
+ "<|mi|>": 50295,
117
+ "<|mk|>": 50308,
118
+ "<|ml|>": 50296,
119
+ "<|mn|>": 50314,
120
+ "<|mr|>": 50320,
121
+ "<|ms|>": 50282,
122
+ "<|mt|>": 50343,
123
+ "<|my|>": 50346,
124
+ "<|ne|>": 50313,
125
+ "<|nl|>": 50271,
126
+ "<|nn|>": 50342,
127
+ "<|no|>": 50288,
128
+ "<|oc|>": 50328,
129
+ "<|pa|>": 50321,
130
+ "<|pl|>": 50269,
131
+ "<|ps|>": 50340,
132
+ "<|pt|>": 50267,
133
+ "<|ro|>": 50284,
134
+ "<|ru|>": 50263,
135
+ "<|sa|>": 50344,
136
+ "<|sd|>": 50332,
137
+ "<|si|>": 50322,
138
+ "<|sk|>": 50298,
139
+ "<|sl|>": 50305,
140
+ "<|sn|>": 50324,
141
+ "<|so|>": 50326,
142
+ "<|sq|>": 50317,
143
+ "<|sr|>": 50303,
144
+ "<|su|>": 50357,
145
+ "<|sv|>": 50273,
146
+ "<|sw|>": 50318,
147
+ "<|ta|>": 50287,
148
+ "<|te|>": 50299,
149
+ "<|tg|>": 50331,
150
+ "<|th|>": 50289,
151
+ "<|tk|>": 50341,
152
+ "<|tl|>": 50348,
153
+ "<|tr|>": 50268,
154
+ "<|tt|>": 50351,
155
+ "<|uk|>": 50280,
156
+ "<|ur|>": 50290,
157
+ "<|uz|>": 50337,
158
+ "<|vi|>": 50278,
159
+ "<|yi|>": 50335,
160
+ "<|yo|>": 50325,
161
+ "<|zh|>": 50260
162
+ },
163
+ "max_initial_timestamp_index": 50,
164
+ "max_length": 448,
165
+ "no_timestamps_token_id": 50363,
166
+ "pad_token_id": 50257,
167
+ "prev_sot_token_id": 50361,
168
+ "return_timestamps": false,
169
+ "suppress_tokens": [],
170
+ "task_to_id": {
171
+ "transcribe": 50359,
172
+ "translate": 50358
173
+ },
174
+ "transformers_version": "4.56.1"
175
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93a91b02485891b74985c1d879e7df6662f123cc96174c2257a23ded0b418f61
3
+ size 966995080