Steveeeeeeen HF Staff committed on
Commit 1e874c9 · verified · 1 Parent(s): aa72be5

add longform tab

Files changed (1)
  1. constants.py +164 -0
constants.py CHANGED
@@ -13,6 +13,34 @@ BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="
 
  TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🤗 Open Automatic Speech Recognition Leaderboard </b> </body> </html>"
 
  INTRODUCTION_TEXT = "📐 The 🤗 Open ASR Leaderboard ranks and evaluates speech recognition models \
  on the Hugging Face Hub. \
  \nWe report the Average [WER](https://huggingface.co/spaces/evaluate-metric/wer) (⬇️ lower the better) and [RTFx](https://github.com/NVIDIA/DeepLearningExamples/blob/master/Kaldi/SpeechRecognition/README.md#metrics) (⬆️ higher the better). Models are ranked based on their Average WER, from lowest to highest. Check the 📈 Metrics tab to understand how the models are evaluated. \
@@ -28,6 +56,142 @@ CITATION_TEXT = """@misc{open-asr-leaderboard,
  }
  """
 
  METRICS_TAB_TEXT = """
  Here you will find details about the speech recognition metrics and datasets reported in our leaderboard.
 
 
  TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🤗 Open Automatic Speech Recognition Leaderboard </b> </body> </html>"
 
+ INTRODUCTION_TEXT = "📐 The 🤗 Open ASR Leaderboard ranks and evaluates speech recognition models \
+ on the Hugging Face Hub. \
+ \nWe report the Average [WER](https://huggingface.co/spaces/evaluate-metric/wer) (⬇️ lower the better) and [RTFx](https://github.com/NVIDIA/DeepLearningExamples/blob/master/Kaldi/SpeechRecognition/README.md#metrics) (⬆️ higher the better). Models are ranked based on their Average WER, from lowest to highest. Check the 📈 Metrics tab to understand how the models are evaluated. \
+ \nIf you want results for a model that is not listed here, you can submit a request for it to be included ✉️✨. \
+ \nThe leaderboard includes both English ASR evaluation and multilingual benchmarks across the top European languages."
+
+ CITATION_TEXT = """@misc{open-asr-leaderboard,
+ title = {Open Automatic Speech Recognition Leaderboard},
+ author = {Srivastav, Vaibhav and Majumdar, Somshubra and Koluguri, Nithin and Moumen, Adel and Gandhi, Sanchit and others},
+ year = 2023,
+ publisher = {Hugging Face},
+ howpublished = "\\url{https://huggingface.co/spaces/hf-audio/open_asr_leaderboard}"
+ }
+ """
+ from pathlib import Path
+
+ # Directory where model evaluation requests are stored
+ DIR_OUTPUT_REQUESTS = Path("requested_models")
+ EVAL_REQUESTS_PATH = Path("eval_requests")
+
+ ##########################
+ # Text definitions #
+ ##########################
+
+ banner_url = "https://huggingface.co/datasets/reach-vb/random-images/resolve/main/asr_leaderboard.png"
+ BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>'
+
+ TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🤗 Open Automatic Speech Recognition Leaderboard </b> </body> </html>"
+
  INTRODUCTION_TEXT = "📐 The 🤗 Open ASR Leaderboard ranks and evaluates speech recognition models \
  on the Hugging Face Hub. \
  \nWe report the Average [WER](https://huggingface.co/spaces/evaluate-metric/wer) (⬇️ lower the better) and [RTFx](https://github.com/NVIDIA/DeepLearningExamples/blob/master/Kaldi/SpeechRecognition/README.md#metrics) (⬆️ higher the better). Models are ranked based on their Average WER, from lowest to highest. Check the 📈 Metrics tab to understand how the models are evaluated. \
 
  }
  """
 
+ METRICS_TAB_TEXT = """
+ Here you will find details about the speech recognition metrics and datasets reported in our leaderboard.
+
+ ## Metrics
+
+ Models are evaluated jointly using the Word Error Rate (WER) and Inverse Real Time Factor (RTFx) metrics. The WER metric
+ is used to assess the accuracy of a system, and the RTFx the inference speed. Models are ranked in the leaderboard based
+ on their WER, lowest to highest.
+
+ Crucially, the WER and RTFx values are computed for the same inference run using a single script. The implication of this is two-fold:
+ 1. The WER and RTFx values are coupled: for a given WER, one can expect to achieve the corresponding RTFx. This allows a submitter to trade off lower WER against higher RTFx should they wish.
+ 2. The WER and RTFx values are averaged over all audio samples in the benchmark (on the order of thousands of samples).
+
+ For details on reproducing the benchmark numbers, refer to the [Open ASR GitHub repository](https://github.com/huggingface/open_asr_leaderboard#evaluate-a-model).
+
+ ### Word Error Rate (WER)
+
+ Word Error Rate is used to measure the **accuracy** of automatic speech recognition systems. It counts the substitutions, insertions,
+ and deletions in the system's output relative to the reference (correct) transcript, expressed as a percentage of the reference length. **A lower WER value indicates higher accuracy**.
+
+ Take the following example:
+
+ | Reference:  | the | cat | sat     | on  | the | mat |
+ |-------------|-----|-----|---------|-----|-----|-----|
+ | Prediction: | the | cat | **sit** | on  | the |     |
+ | Label:      | ✅  | ✅  | S       | ✅  | ✅  | D   |
+
+ Here, we have:
+ * 1 substitution ("sit" instead of "sat")
+ * 0 insertions
+ * 1 deletion ("mat" is missing)
+
+ This gives 2 errors in total. To get our word error rate, we divide the total number of errors (substitutions + insertions + deletions) by the total number of words in our
+ reference (N), which for this example is 6:
+
+ ```
+ WER = (S + I + D) / N = (1 + 0 + 1) / 6 = 0.333
+ ```
+
+ This gives a WER of 0.33, or 33%. For a fair comparison, we calculate **zero-shot** (i.e. pre-trained models only) *normalised WER* for all the model checkpoints, meaning punctuation and casing are removed from the references and predictions. You can find the evaluation code on our [GitHub repository](https://github.com/huggingface/open_asr_leaderboard). To read more about how the WER is computed, refer to the [Audio Transformers Course](https://huggingface.co/learn/audio-course/chapter5/evaluation).
+
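+ As a rough illustration (a minimal sketch, not the leaderboard's own evaluation script), the example above can be reproduced with the [`evaluate`](https://huggingface.co/docs/evaluate) library; the simple lower-casing below only stands in for the full text normalisation used in the official code:
+
+ ```
+ import evaluate
+
+ wer_metric = evaluate.load("wer")
+
+ reference = "the cat sat on the mat"
+ prediction = "The cat sit on the"
+
+ # Crude stand-in for the leaderboard's normaliser: lower-case both strings
+ # so that casing differences are not counted as errors.
+ wer = wer_metric.compute(
+     references=[reference.lower()],
+     predictions=[prediction.lower()],
+ )
+
+ print(f"WER = {wer:.3f}")  # 2 errors / 6 reference words = 0.333
+ ```
+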
+ ### Inverse Real Time Factor (RTFx)
+
+ Inverse Real Time Factor is a measure of the **latency** of automatic speech recognition systems, i.e. how long it takes a
+ model to process a given amount of speech. It is defined as:
+ ```
+ RTFx = (number of seconds of audio inferred) / (compute time in seconds)
+ ```
+
+ Therefore, an RTFx of 1 means a system processes speech as fast as it is spoken, while an RTFx of 2 means it takes half that time.
+ Thus, **a higher RTFx value indicates lower latency**.
+
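+ As a minimal sketch (assuming a generic `transcribe` function and a list of audio files; this is illustrative, not the leaderboard's benchmarking script), an RTFx value could be measured like this:
+
+ ```
+ import time
+
+ def measure_rtfx(transcribe, audio_files, total_audio_seconds):
+     """Return RTFx = seconds of audio processed / wall-clock compute seconds."""
+     start = time.perf_counter()
+     for audio in audio_files:
+         transcribe(audio)  # run inference on each file
+     compute_seconds = time.perf_counter() - start
+     return total_audio_seconds / compute_seconds
+ ```
+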
+ ## How to reproduce our results
+
+ The ASR Leaderboard is an ongoing effort: we will continue to benchmark open-source and open-access speech recognition models where possible.
+ Along with the Leaderboard, we're open-sourcing the codebase used for running these evaluations.
+ For more details, head over to our repo: https://github.com/huggingface/open_asr_leaderboard
+
+ P.S. We'd love to know which other models you'd like us to benchmark next. Contributions are more than welcome! ♥️
+
+ ## Benchmark datasets
+
+ Evaluating Speech Recognition systems is a hard problem. We use the multi-dataset benchmarking strategy proposed in the
+ [ESB paper](https://arxiv.org/abs/2210.13352) to obtain robust evaluation scores for each model.
+
+ ESB is a benchmark for evaluating the performance of a single automatic speech recognition (ASR) system across a broad
+ set of speech datasets. It comprises eight English speech recognition datasets, capturing a broad range of domains,
+ acoustic conditions, speaker styles, and transcription requirements. As such, it gives a better indication of how
+ a model is likely to perform on downstream ASR compared to evaluating it on one dataset alone.
+
+ The ESB score is calculated as a macro-average of the WER scores across the ESB datasets (a short sketch of this averaging follows the table below). The models in the leaderboard
+ are ranked based on their average WER scores, from lowest to highest.
+
+ | Dataset                                                               | Domain                      | Speaking Style        | Train (h) | Dev (h) | Test (h) | Transcriptions     | License         |
+ |-----------------------------------------------------------------------|-----------------------------|-----------------------|-----------|---------|----------|--------------------|-----------------|
+ | [LibriSpeech](https://huggingface.co/datasets/librispeech_asr)        | Audiobook                   | Narrated              | 960       | 11      | 11       | Normalised         | CC-BY-4.0       |
+ | [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli)       | European Parliament         | Oratory               | 523       | 5       | 5        | Punctuated         | CC0             |
+ | [TED-LIUM](https://huggingface.co/datasets/LIUM/tedlium)              | TED talks                   | Oratory               | 454       | 2       | 3        | Normalised         | CC-BY-NC-ND 3.0 |
+ | [GigaSpeech](https://huggingface.co/datasets/speechcolab/gigaspeech)  | Audiobook, podcast, YouTube | Narrated, spontaneous | 2500      | 12      | 40       | Punctuated         | apache-2.0      |
+ | [SPGISpeech](https://huggingface.co/datasets/kensho/spgispeech)       | Financial meetings          | Oratory, spontaneous  | 4900      | 100     | 100      | Punctuated & Cased | User Agreement  |
+ | [Earnings-22](https://huggingface.co/datasets/revdotcom/earnings22)   | Financial meetings          | Oratory, spontaneous  | 105       | 5       | 5        | Punctuated & Cased | CC-BY-SA-4.0    |
+ | [AMI](https://huggingface.co/datasets/edinburghcstr/ami)              | Meetings                    | Spontaneous           | 78        | 9       | 9        | Punctuated & Cased | CC-BY-4.0       |
+
+ For more details on the individual datasets and how models are evaluated to give the ESB score, refer to the [ESB paper](https://arxiv.org/abs/2210.13352).
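+
+ Concretely, the macro-average weights every dataset equally, regardless of how many hours it contains. A minimal sketch (the dataset names and WER values below are placeholders, not leaderboard results):
+
+ ```
+ per_dataset_wer = {
+     "librispeech": 4.0,  # placeholder values, one entry per ESB test set
+     "voxpopuli": 8.0,
+     "ami": 15.0,
+ }
+
+ # Macro-average: a simple unweighted mean of the per-dataset WERs.
+ average_wer = sum(per_dataset_wer.values()) / len(per_dataset_wer)
+ ```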
+ """
+
+ # Multilingual benchmark definitions
+ EU_LANGUAGES = {
+     "de": {"name": "German", "flag": "🇩🇪", "datasets": ["mls", "fleurs", "covost"]},
+     "fr": {"name": "French", "flag": "🇫🇷", "datasets": ["mls", "fleurs", "covost"]},
+     "it": {"name": "Italian", "flag": "🇮🇹", "datasets": ["mls", "fleurs", "covost"]},
+     "es": {"name": "Spanish", "flag": "🇪🇸", "datasets": ["mls", "fleurs", "covost"]},
+     "pt": {"name": "Portuguese", "flag": "🇵🇹", "datasets": ["mls", "fleurs", "covost"]}
+ }
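+
+ # Minimal illustrative sketch (not part of the committed app code): one way the
+ # EU_LANGUAGES mapping might be consumed when building the multilingual tab,
+ # e.g. to render dropdown labels such as "🇩🇪 German (de)".
+ def _example_language_choices():
+     return [
+         f"{info['flag']} {info['name']} ({code})"
+         for code, info in EU_LANGUAGES.items()
+     ]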
+
+ MULTILINGUAL_TAB_TEXT = """
+ ## 🌍 Multilingual ASR Evaluation
+
+ """
+
+ LONGFORM_TAB_TEXT = """
+ ## 📏 Long-form ASR Evaluation
+
+ """
+
+ LEADERBOARD_CSS = """
+ #leaderboard-table th .header-content {
+     white-space: nowrap;
+ }
+
+ #multilingual-table th .header-content {
+     white-space: nowrap;
+ }
+
+ #multilingual-table th:hover {
+     background-color: var(--table-row-focus);
+ }
+
+ #longform-table th .header-content {
+     white-space: nowrap;
+ }
+
+ #longform-table th:hover {
+     background-color: var(--table-row-focus);
+ }
+
+ .language-detail-modal {
+     background: var(--background-fill-primary);
+     border: 1px solid var(--border-color-primary);
+     border-radius: 8px;
+     padding: 1rem;
+     margin: 1rem 0;
+ }
+ """
+
+
  METRICS_TAB_TEXT = """
  Here you will find details about the speech recognition metrics and datasets reported in our leaderboard.