Buckets:

rtrm's picture
download
raw
72.4 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Evaluation metrics for ASR&quot;,&quot;local&quot;:&quot;evaluation-metrics-for-asr&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Word Error Rate&quot;,&quot;local&quot;:&quot;word-error-rate&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Word Accuracy&quot;,&quot;local&quot;:&quot;word-accuracy&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Character Error Rate&quot;,&quot;local&quot;:&quot;character-error-rate&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Which metric should I use?&quot;,&quot;local&quot;:&quot;which-metric-should-i-use&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Normalisation&quot;,&quot;local&quot;:&quot;normalisation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Putting it all together&quot;,&quot;local&quot;:&quot;putting-it-all-together&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/audio-course/pr_201/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/audio-course/pr_201/en/_app/immutable/entry/start.367c4d78.js">
<link rel="modulepreload" href="/docs/audio-course/pr_201/en/_app/immutable/chunks/scheduler.f7e1785c.js">
<link rel="modulepreload" href="/docs/audio-course/pr_201/en/_app/immutable/chunks/singletons.0d70d4cc.js">
<link rel="modulepreload" href="/docs/audio-course/pr_201/en/_app/immutable/chunks/index.279db187.js">
<link rel="modulepreload" href="/docs/audio-course/pr_201/en/_app/immutable/chunks/paths.274f629d.js">
<link rel="modulepreload" href="/docs/audio-course/pr_201/en/_app/immutable/entry/app.4c54ebf9.js">
<link rel="modulepreload" href="/docs/audio-course/pr_201/en/_app/immutable/chunks/index.9f8f0838.js">
<link rel="modulepreload" href="/docs/audio-course/pr_201/en/_app/immutable/nodes/0.e329f606.js">
<link rel="modulepreload" href="/docs/audio-course/pr_201/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/audio-course/pr_201/en/_app/immutable/nodes/31.13a2de8f.js">
<link rel="modulepreload" href="/docs/audio-course/pr_201/en/_app/immutable/chunks/Tip.4575d9cf.js">
<link rel="modulepreload" href="/docs/audio-course/pr_201/en/_app/immutable/chunks/CodeBlock.b3510e34.js">
<link rel="modulepreload" href="/docs/audio-course/pr_201/en/_app/immutable/chunks/EditOnGithub.5a9bb8c5.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Evaluation metrics for ASR&quot;,&quot;local&quot;:&quot;evaluation-metrics-for-asr&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Word Error Rate&quot;,&quot;local&quot;:&quot;word-error-rate&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Word Accuracy&quot;,&quot;local&quot;:&quot;word-accuracy&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Character Error Rate&quot;,&quot;local&quot;:&quot;character-error-rate&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Which metric should I use?&quot;,&quot;local&quot;:&quot;which-metric-should-i-use&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Normalisation&quot;,&quot;local&quot;:&quot;normalisation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Putting it all together&quot;,&quot;local&quot;:&quot;putting-it-all-together&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="evaluation-metrics-for-asr" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#evaluation-metrics-for-asr"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Evaluation metrics for ASR</span></h1> <p data-svelte-h="svelte-bhh0s4">If you’re familiar with the <a href="https://en.wikipedia.org/wiki/Levenshtein_distance" rel="nofollow">Levenshtein distance</a> from NLP, the
metrics for assessing speech recognition systems will be familiar! Don’t worry if you’re not, we’ll go through the
explanations start-to-finish to make sure you know the different metrics and understand what they mean.</p> <p data-svelte-h="svelte-14gi20s">When assessing speech recognition systems, we compare the system’s predictions to the target text transcriptions,
annotating any errors that are present. We categorise these errors into one of three categories:</p> <ol data-svelte-h="svelte-1rakz49"><li>Substitutions (S): where we transcribe the <strong>wrong word</strong> in our prediction (“sit” instead of “sat”)</li> <li>Insertions (I): where we add an <strong>extra word</strong> in our prediction</li> <li>Deletions (D): where we <strong>remove a word</strong> in our prediction</li></ol> <p data-svelte-h="svelte-1u4b8et">These error categories are the same for all speech recognition metrics. What differs is the level at which we compute
these errors: we can either compute them on the <em>word level</em> or on the <em>character level</em>.</p> <p data-svelte-h="svelte-1hnsgur">We’ll use a running example for each of the metric definitions. Here, we have a <em>ground truth</em> or <em>reference</em> text sequence:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->reference = <span class="hljs-string">&quot;the cat sat on the mat&quot;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1oavasl">And a predicted sequence from the speech recognition system that we’re trying to assess:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->prediction = <span class="hljs-string">&quot;the cat sit on the&quot;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-meydoa">We can see that the prediction is pretty close, but some words are not quite right. We’ll evaluate this prediction
against the reference for the three most popular speech recognition metrics and see what sort of numbers we get for each.</p> <h2 class="relative group"><a id="word-error-rate" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#word-error-rate"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Word Error Rate</span></h2> <p data-svelte-h="svelte-etln9n">The <em>word error rate (WER)</em> metric is the ‘de facto’ metric for speech recognition. It calculates substitutions,
insertions and deletions on the <em>word level</em>. This means errors are annotated on a word-by-word basis. Take our example:</p> <table data-svelte-h="svelte-1p0gpk"><thead><tr><th>Reference:</th> <th>the</th> <th>cat</th> <th>sat</th> <th>on</th> <th>the</th> <th>mat</th></tr></thead> <tbody><tr><td>Prediction:</td> <td>the</td> <td>cat</td> <td><strong>sit</strong></td> <td>on</td> <td>the</td> <td></td></tr> <tr><td>Label:</td> <td></td> <td></td> <td>S</td> <td></td> <td></td> <td>D</td></tr></tbody></table> <p data-svelte-h="svelte-83gpva">Here, we have:</p> <ul data-svelte-h="svelte-rr6mcc"><li>1 substitution (“sit” instead of “sat”)</li> <li>0 insertions</li> <li>1 deletion (“mat” is missing)</li></ul> <p>This gives 2 errors in total. To get our error rate, we divide the number of errors by the total number of words in our
reference (N), which for this example is 6:
<!-- HTML_TAG_START --><span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mtable rowspacing="0.25em" columnalign="right left" columnspacing="0em"><mtr><mtd><mstyle scriptlevel="0" displaystyle="true"><mrow><mi>W</mi><mi>E</mi><mi>R</mi></mrow></mstyle></mtd><mtd><mstyle scriptlevel="0" displaystyle="true"><mrow><mrow></mrow><mo>=</mo><mfrac><mrow><mi>S</mi><mo>+</mo><mi>I</mi><mo>+</mo><mi>D</mi></mrow><mi>N</mi></mfrac></mrow></mstyle></mtd></mtr><mtr><mtd><mstyle scriptlevel="0" displaystyle="true"><mrow></mrow></mstyle></mtd><mtd><mstyle scriptlevel="0" displaystyle="true"><mrow><mrow></mrow><mo>=</mo><mfrac><mrow><mn>1</mn><mo>+</mo><mn>0</mn><mo>+</mo><mn>1</mn></mrow><mn>6</mn></mfrac></mrow></mstyle></mtd></mtr><mtr><mtd><mstyle scriptlevel="0" displaystyle="true"><mrow></mrow></mstyle></mtd><mtd><mstyle scriptlevel="0" displaystyle="true"><mrow><mrow></mrow><mo>=</mo><mn>0.333</mn></mrow></mstyle></mtd></mtr></mtable><annotation encoding="application/x-tex">
\begin{aligned}
WER &amp;= \frac{S + I + D}{N} \\
&amp;= \frac{1 + 0 + 1}{6} \\
&amp;= 0.333
\end{aligned}
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:6.1538em;vertical-align:-2.8269em;"></span><span class="mord"><span class="mtable"><span class="col-align-r"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:3.3269em;"><span style="top:-5.3269em;"><span class="pstrut" style="height:3.3603em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.13889em;">W</span><span class="mord mathnormal" style="margin-right:0.00773em;">ER</span></span></span><span style="top:-3.0194em;"><span class="pstrut" style="height:3.3603em;"></span><span class="mord"></span></span><span style="top:-1.1934em;"><span class="pstrut" style="height:3.3603em;"></span><span class="mord"></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:2.8269em;"><span></span></span></span></span></span><span class="col-align-l"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:3.3269em;"><span style="top:-5.3269em;"><span class="pstrut" style="height:3.3603em;"></span><span class="mord"><span class="mord"></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3603em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.10903em;">N</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.05764em;">S</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord mathnormal" style="margin-right:0.07847em;">I</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord mathnormal" style="margin-right:0.02778em;">D</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span></span></span><span style="top:-3.0194em;"><span class="pstrut" style="height:3.3603em;"></span><span class="mord"><span class="mord"></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">6</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">1</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord">0</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord">1</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span></span></span><span style="top:-1.1934em;"><span class="pstrut" style="height:3.3603em;"></span><span class="mord"><span class="mord"></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord">0.333</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:2.8269em;"><span></span></span></span></span></span></span></span></span></span></span></span><!-- HTML_TAG_END --></p> <p data-svelte-h="svelte-1x9kpft">Alright! So we have a WER of 0.333, or 33.3%. Notice how the word “sit” only has one character that is wrong, but the
entire word is marked incorrect. This is a defining feature of the WER: spelling errors are penalised heavily, no matter
how minor they are.</p> <p data-svelte-h="svelte-1j0jh50">The WER is defined such that <em>lower is better</em>: a lower WER means there are fewer errors in our prediction, so a perfect
speech recognition system would have a WER of zero (no errors).</p> <p data-svelte-h="svelte-1ia5odq">Let’s see how we can compute the WER using 🤗 Evaluate. We’ll need two packages to compute our WER metric: 🤗 Evaluate
for the API interface, and JIWER to do the heavy lifting of running the calculation:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install <span class="hljs-comment">--upgrade evaluate jiwer</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-vl6c9u">Great! We can now load up the WER metric and compute the figure for our example:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> evaluate <span class="hljs-keyword">import</span> load
wer_metric = load(<span class="hljs-string">&quot;wer&quot;</span>)
wer = wer_metric.compute(references=[reference], predictions=[prediction])
<span class="hljs-built_in">print</span>(wer)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-j2cy7r"><strong>Print Output:</strong></p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-attribute">0</span>.<span class="hljs-number">3333333333333333</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ppsr8o">0.33, or 33.3%, as expected! We now know what’s going on under-the-hood with this WER calculation.</p> <p data-svelte-h="svelte-uoepkq">Now, here’s something that’s quite confusing… What do you think the upper limit of the WER is? You would expect it to be
1 or 100% right? Nuh uh! Since the WER is the ratio of errors to number of words (N), there is no upper limit on the WER!
Let’s take an example were we predict 10 words and the target only has 2 words. If all of our predictions were wrong (10 errors),
we’d have a WER of 10 / 2 = 5, or 500%! This is something to bear in mind if you train an ASR system and see a WER of over
100%. Although if you’re seeing this, something has likely gone wrong… 😅</p> <h2 class="relative group"><a id="word-accuracy" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#word-accuracy"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Word Accuracy</span></h2> <p>We can flip the WER around to give us a metric where <em data-svelte-h="svelte-efgwkj">higher is better</em>. Rather than measuring the word error rate,
we can measure the <em data-svelte-h="svelte-wk6fn0">word accuracy (WAcc)</em> of our system:
<!-- HTML_TAG_START --><span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mtable rowspacing="0.16em" columnspacing="1em"><mtr><mtd><mstyle scriptlevel="0" displaystyle="true"><mrow><mi>W</mi><mi>A</mi><mi>c</mi><mi>c</mi><mo>=</mo><mn>1</mn><mo></mo><mi>W</mi><mi>E</mi><mi>R</mi></mrow></mstyle></mtd></mtr></mtable><annotation encoding="application/x-tex">
\begin{equation}
WAcc = 1 - WER \nonumber
\end{equation}
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.2em;vertical-align:-0.35em;"></span><span class="mord"><span class="mtable"><span class="col-align-c"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.85em;"><span style="top:-3.01em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.13889em;">W</span><span class="mord mathnormal">A</span><span class="mord mathnormal">cc</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord">1</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin"></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">W</span><span class="mord mathnormal" style="margin-right:0.00773em;">ER</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.35em;"><span></span></span></span></span></span></span></span></span></span></span></span><!-- HTML_TAG_END --></p> <p data-svelte-h="svelte-14zvp5d">The WAcc is also measured on the word-level, it’s just the WER reformulated as an accuracy metric rather than an error
metric. The WAcc is very infrequently quoted in the speech literature - we think of our system predictions in terms of
word errors, and so prefer error rate metrics that are more associated with these error type annotations.</p> <h2 class="relative group"><a id="character-error-rate" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#character-error-rate"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Character Error Rate</span></h2> <p data-svelte-h="svelte-cssi9y">It seems a bit unfair that we marked the entire word for “sit” wrong when in fact only one letter was incorrect.
That’s because we were evaluating our system on the word level, thereby annotating errors on a word-by-word basis.
The <em>character error rate (CER)</em> assesses systems on the <em>character level</em>. This means we divide up our words into their
individual characters, and annotate errors on a character-by-character basis:</p> <table data-svelte-h="svelte-1jkokkh"><thead><tr><th>Reference:</th> <th>t</th> <th>h</th> <th>e</th> <th></th> <th>c</th> <th>a</th> <th>t</th> <th></th> <th>s</th> <th>a</th> <th>t</th> <th></th> <th>o</th> <th>n</th> <th></th> <th>t</th> <th>h</th> <th>e</th> <th></th> <th>m</th> <th>a</th> <th>t</th></tr></thead> <tbody><tr><td>Prediction:</td> <td>t</td> <td>h</td> <td>e</td> <td></td> <td>c</td> <td>a</td> <td>t</td> <td></td> <td>s</td> <td><strong>i</strong></td> <td>t</td> <td></td> <td>o</td> <td>n</td> <td></td> <td>t</td> <td>h</td> <td>e</td> <td></td> <td></td> <td></td> <td></td></tr> <tr><td>Label:</td> <td></td> <td></td> <td></td> <td></td> <td></td> <td></td> <td></td> <td></td> <td></td> <td>S</td> <td></td> <td></td> <td></td> <td></td> <td></td> <td></td> <td></td> <td></td> <td></td> <td>D</td> <td>D</td> <td>D</td></tr></tbody></table> <p data-svelte-h="svelte-jzeweo">We can see now that for the word “sit”, the “s” and “t” are marked as correct. It’s only the “i” which is labelled as a
substitution error (S). Thus, we reward our system for the partially correct prediction 🤝</p> <p>In our example, we have 1 character substitution, 0 insertions, and 3 deletions. In total, we have 14 characters. So, our CER is:
<!-- HTML_TAG_START --><span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mtable rowspacing="0.25em" columnalign="right left" columnspacing="0em"><mtr><mtd><mstyle scriptlevel="0" displaystyle="true"><mrow><mi>C</mi><mi>E</mi><mi>R</mi></mrow></mstyle></mtd><mtd><mstyle scriptlevel="0" displaystyle="true"><mrow><mrow></mrow><mo>=</mo><mfrac><mrow><mi>S</mi><mo>+</mo><mi>I</mi><mo>+</mo><mi>D</mi></mrow><mi>N</mi></mfrac></mrow></mstyle></mtd></mtr><mtr><mtd><mstyle scriptlevel="0" displaystyle="true"><mrow></mrow></mstyle></mtd><mtd><mstyle scriptlevel="0" displaystyle="true"><mrow><mrow></mrow><mo>=</mo><mfrac><mrow><mn>1</mn><mo>+</mo><mn>0</mn><mo>+</mo><mn>3</mn></mrow><mn>14</mn></mfrac></mrow></mstyle></mtd></mtr><mtr><mtd><mstyle scriptlevel="0" displaystyle="true"><mrow></mrow></mstyle></mtd><mtd><mstyle scriptlevel="0" displaystyle="true"><mrow><mrow></mrow><mo>=</mo><mn>0.286</mn></mrow></mstyle></mtd></mtr></mtable><annotation encoding="application/x-tex">
\begin{aligned}
CER &amp;= \frac{S + I + D}{N} \\
&amp;= \frac{1 + 0 + 3}{14} \\
&amp;= 0.286
\end{aligned}
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:6.1538em;vertical-align:-2.8269em;"></span><span class="mord"><span class="mtable"><span class="col-align-r"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:3.3269em;"><span style="top:-5.3269em;"><span class="pstrut" style="height:3.3603em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.00773em;">CER</span></span></span><span style="top:-3.0194em;"><span class="pstrut" style="height:3.3603em;"></span><span class="mord"></span></span><span style="top:-1.1934em;"><span class="pstrut" style="height:3.3603em;"></span><span class="mord"></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:2.8269em;"><span></span></span></span></span></span><span class="col-align-l"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:3.3269em;"><span style="top:-5.3269em;"><span class="pstrut" style="height:3.3603em;"></span><span class="mord"><span class="mord"></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3603em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.10903em;">N</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.05764em;">S</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord mathnormal" style="margin-right:0.07847em;">I</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord mathnormal" style="margin-right:0.02778em;">D</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span></span></span><span style="top:-3.0194em;"><span class="pstrut" style="height:3.3603em;"></span><span class="mord"><span class="mord"></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">14</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">1</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord">0</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord">3</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span></span></span><span style="top:-1.1934em;"><span class="pstrut" style="height:3.3603em;"></span><span class="mord"><span class="mord"></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord">0.286</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:2.8269em;"><span></span></span></span></span></span></span></span></span></span></span></span><!-- HTML_TAG_END --></p> <p data-svelte-h="svelte-nqrf1c">Right! We have a CER of 0.286, or 28.6%. Notice how this is lower than our WER - we penalised the spelling error much less.</p> <h2 class="relative group"><a id="which-metric-should-i-use" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#which-metric-should-i-use"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Which metric should I use?</span></h2> <p data-svelte-h="svelte-1gur6wt">In general, the WER is used far more than the CER for assessing speech systems. This is because the WER requires systems
to have greater understanding of the context of the predictions. In our example, “sit” is in the wrong tense. A system
that understands the relationship between the verb and tense of the sentence would have predicted the correct verb tense
of “sat”. We want to encourage this level of understanding from our speech systems. So although the WER is less forgiving than
the CER, it’s also more conducive to the kinds of intelligible systems we want to develop. Therefore, we typically use
the WER and would encourage you to as well! However, there are circumstances where it is not possible to use the WER.
Certain languages, such as Mandarin and Japanese, have no notion of ‘words’, and so the WER is meaningless. Here, we revert
to using the CER.</p> <p data-svelte-h="svelte-sm17uh">In our example, we only used one sentence when computing the WER. We would typically use an entire test set consisting
of several thousand sentences when evaluating a real system. When evaluating over multiple sentences, we aggregate S, I, D
and N across all sentences, and then compute the WER according to the formula defined above. This gives a better estimate
of the WER for unseen data.</p> <h2 class="relative group"><a id="normalisation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#normalisation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Normalisation</span></h2> <p data-svelte-h="svelte-npx9p1">If we train an ASR model on data with punctuation and casing, it will learn to predict casing and punctuation in its
transcriptions. This is great when we want to use our model for actual speech recognition applications, such as
transcribing meetings or dictation, since the predicted transcriptions will be fully formatted with casing and punctuation,
a style referred to as <em>orthographic</em>.</p> <p data-svelte-h="svelte-8y51ri">However, we also have the option of <em>normalising</em> the dataset to remove any casing and punctuation. Normalising the
dataset makes the speech recognition task easier: the model no longer needs to distinguish between upper and lower case
characters, or have to predict punctuation from the audio data alone (e.g. what sound does a semi-colon make?).
Because of this, the word error rates are naturally lower (meaning the results are better). The Whisper paper demonstrates
the drastic effect that normalising transcriptions can have on WER results (<em>c.f.</em> Section 4.4 of the <a href="https://cdn.openai.com/papers/whisper.pdf" rel="nofollow">Whisper paper</a>).
While we get lower WERs, the model isn’t necessarily better for production. The lack of casing and punctuation makes the predicted
text from the model significantly harder to read. Take the example from the <a href="asr_models">previous section</a>, where we ran
Wav2Vec2 and Whisper on the same audio sample from the LibriSpeech dataset. The Wav2Vec2 model predicts neither punctuation
nor casing, whereas Whisper predicts both. Comparing the transcriptions side-by-side, we see that the Whisper transcription
is far easier to read:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-symbol">Wav2Vec2:</span> HE TELLS US THAT <span class="hljs-built_in">AT</span> THIS FESTIVE SEASON OF THE YEAR WITH CHRISTMAUS <span class="hljs-keyword">AND </span>ROSE <span class="hljs-keyword">BEEF </span>LOOMING <span class="hljs-keyword">BEFORE </span>US SIMALYIS DRAWN FROM EATING <span class="hljs-keyword">AND </span>ITS RESULTS OCCUR MOST READILY TO THE MIND
<span class="hljs-symbol">Whisper:</span> He tells us that <span class="hljs-built_in">at</span> this festive season of the year, with Christmas <span class="hljs-keyword">and </span>roast <span class="hljs-keyword">beef </span>looming <span class="hljs-keyword">before </span>us, similarly is drawn from eating <span class="hljs-keyword">and </span>its results occur most readily to the mind.<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-inhmc">The Whisper transcription is orthographic and thus ready to go - it’s formatted as we’d expect for a meeting transcription
or dictation script with both punctuation and casing. On the contrary, we would need to use additional post-processing
to restore punctuation and casing in our Wav2Vec2 predictions if we wanted to use it for downstream applications.</p> <p data-svelte-h="svelte-5l9h63">There is a happy medium between normalising and not normalising: we can train our systems on orthographic transcriptions,
and then normalise the predictions and targets before computing the WER. This way, we train our systems to predict fully
formatted text, but also benefit from the WER improvements we get by normalising the transcriptions.</p> <p data-svelte-h="svelte-ht49w6">The Whisper model was released with a normaliser that effectively handles the normalisation of casing, punctuation and
number formatting among others. Let’s apply the normaliser to the Whisper transcriptions to demonstrate how we can
normalise them:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers.models.whisper.english_normalizer <span class="hljs-keyword">import</span> BasicTextNormalizer
normalizer = BasicTextNormalizer()
prediction = <span class="hljs-string">&quot; He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly is drawn from eating and its results occur most readily to the mind.&quot;</span>
normalized_prediction = normalizer(prediction)
normalized_prediction<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mvdyro"><strong>Output:</strong></p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->&#x27; he tells us <span class="hljs-keyword">that</span> <span class="hljs-keyword">at</span> this festive season <span class="hljs-keyword">of</span> <span class="hljs-keyword">the</span> <span class="hljs-built_in">year</span> <span class="hljs-keyword">with</span> christmas <span class="hljs-keyword">and</span> roast beef looming <span class="hljs-keyword">before</span> us similarly <span class="hljs-keyword">is</span> drawn <span class="hljs-keyword">from</span> eating <span class="hljs-keyword">and</span> <span class="hljs-keyword">its</span> results occur most readily <span class="hljs-keyword">to</span> <span class="hljs-keyword">the</span> mind &#x27;<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-kbg2hm">Great! We can see that the text has been fully lower-cased and all punctuation removed. Let’s now define the reference
transcription and then compute the normalised WER between the reference and prediction:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->reference = <span class="hljs-string">&quot;HE TELLS US THAT AT THIS FESTIVE SEASON OF THE YEAR WITH CHRISTMAS AND ROAST BEEF LOOMING BEFORE US SIMILES DRAWN FROM EATING AND ITS RESULTS OCCUR MOST READILY TO THE MIND&quot;</span>
normalized_referece = normalizer(reference)
wer = wer_metric.compute(
references=[normalized_referece], predictions=[normalized_prediction]
)
wer<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mvdyro"><strong>Output:</strong></p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-attribute">0</span>.<span class="hljs-number">0625</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mk0tzt">6.25% - that’s about what we’d expect for the Whisper base model on the LibriSpeech validation set. As we see here,
we’ve predicted an orthographic transcription, but benefited from the WER boost obtained by normalising the reference and
prediction prior to computing the WER.</p> <p data-svelte-h="svelte-pot7ah">The choice of how you normalise the transcriptions is ultimately down to your needs. We recommend training on
orthographic text and evaluating on normalised text to get the best of both worlds.</p> <h2 class="relative group"><a id="putting-it-all-together" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#putting-it-all-together"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Putting it all together</span></h2> <p data-svelte-h="svelte-a0jkyj">Alright! We’ve covered three topics so far in this Unit: pre-trained models, dataset selection and evaluation.
Let’s have some fun and put them together in one end-to-end example 🚀 We’re going to set ourselves up for the next
section on fine-tuning by evaluating the pre-trained Whisper model on the Common Voice 13 Dhivehi test set. We’ll use
the WER number we get as a <em>baseline</em> for our fine-tuning run, or a target number that we’ll try and beat 🥊</p> <p data-svelte-h="svelte-657gq0">First, we’ll load the pre-trained Whisper model using the <code>pipeline()</code> class. This process will be extremely familiar by now!
The only new thing we’ll do is load the model in half-precision (float16) if running on a GPU - this will speed up
inference at almost no cost to WER accuracy.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">if</span> torch.cuda.is_available():
device = <span class="hljs-string">&quot;cuda:0&quot;</span>
torch_dtype = torch.float16
<span class="hljs-keyword">else</span>:
device = <span class="hljs-string">&quot;cpu&quot;</span>
torch_dtype = torch.float32
pipe = pipeline(
<span class="hljs-string">&quot;automatic-speech-recognition&quot;</span>,
model=<span class="hljs-string">&quot;openai/whisper-small&quot;</span>,
torch_dtype=torch_dtype,
device=device,
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1q149qh">Next, we’ll load the Dhivehi test split of Common Voice 13. You’ll remember from the previous section that the Common
Voice 13 is <em>gated</em>, meaning we had to agree to the dataset terms of use before gaining access to the dataset. We can
now link our Hugging Face account to our notebook, so that we have access to the dataset from the machine we’re currently
using.</p> <p data-svelte-h="svelte-dngs6s">Linking the notebook to the Hub is straightforward - it simply requires entering your Hub authentication token when prompted.
Find your Hub authentication token <a href="https://huggingface.co/settings/tokens" rel="nofollow">here</a> and enter it when prompted:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> notebook_login
notebook_login()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-wi5uul">Great! Once we’ve linked the notebook to our Hugging Face account, we can proceed with downloading the Common Voice
dataset. This will take a few minutes to download and pre-process, fetching the data from the Hugging Face Hub and
preparing it automatically on your notebook:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
common_voice_test = load_dataset(
<span class="hljs-string">&quot;mozilla-foundation/common_voice_13_0&quot;</span>, <span class="hljs-string">&quot;dv&quot;</span>, split=<span class="hljs-string">&quot;test&quot;</span>
)<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400">If you face an authentication issue when loading the dataset, ensure that you have accepted the dataset&#39;s terms of use
on the Hugging Face Hub through the following link: https://huggingface.co/datasets/mozilla-foundation/common_voice_13_0</div> <p data-svelte-h="svelte-y5ayq9">Evaluating over an entire dataset can be done in much the same way as over a single example - all we have to do is <strong>loop</strong>
over the input audios, rather than inferring just a single sample. To do this, we first transform our dataset into a
<code>KeyDataset</code>. All this does is pick out the particular dataset column that we want to forward to the model (in our case, that’s
the <code>&quot;audio&quot;</code> column), ignoring the rest (like the target transcriptions, which we don’t want to use for inference). We
then iterate over this transformed datasets, appending the model outputs to a list to save the predictions. The
following code cell will take approximately five minutes if running on a GPU with half-precision, peaking at 12GB memory:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> tqdm <span class="hljs-keyword">import</span> tqdm
<span class="hljs-keyword">from</span> transformers.pipelines.pt_utils <span class="hljs-keyword">import</span> KeyDataset
all_predictions = []
<span class="hljs-comment"># run streamed inference</span>
<span class="hljs-keyword">for</span> prediction <span class="hljs-keyword">in</span> tqdm(
pipe(
KeyDataset(common_voice_test, <span class="hljs-string">&quot;audio&quot;</span>),
max_new_tokens=<span class="hljs-number">128</span>,
generate_kwargs={<span class="hljs-string">&quot;task&quot;</span>: <span class="hljs-string">&quot;transcribe&quot;</span>},
batch_size=<span class="hljs-number">32</span>,
),
total=<span class="hljs-built_in">len</span>(common_voice_test),
):
all_predictions.append(prediction[<span class="hljs-string">&quot;text&quot;</span>])<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400">If you experience a CUDA out-of-memory (OOM) when running the above cell, incrementally reduce the `batch_size` by
factors of 2 until you find a batch size that fits your device.</div> <p data-svelte-h="svelte-13bou4y">And finally, we can compute the WER. Let’s first compute the orthographic WER, i.e. the WER without any post-processing:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> evaluate <span class="hljs-keyword">import</span> load
wer_metric = load(<span class="hljs-string">&quot;wer&quot;</span>)
wer_ortho = <span class="hljs-number">100</span> * wer_metric.compute(
references=common_voice_test[<span class="hljs-string">&quot;sentence&quot;</span>], predictions=all_predictions
)
wer_ortho<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mvdyro"><strong>Output:</strong></p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-attribute">167</span>.<span class="hljs-number">29577268612022</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-dzd4kp">Okay… 167% essentially means our model is outputting garbage 😜 Not to worry, it’ll be our aim to improve this by
fine-tuning the model on the Dhivehi training set!</p> <p data-svelte-h="svelte-ltuvl8">Next, we’ll evaluate the normalised WER, i.e. the WER with normalisation post-processing. We have to filter out samples
that would be empty after normalisation, as otherwise the total number of words in our reference (N) would be zero, which
would give a division by zero error in our calculation:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers.models.whisper.english_normalizer <span class="hljs-keyword">import</span> BasicTextNormalizer
normalizer = BasicTextNormalizer()
<span class="hljs-comment"># compute normalised WER</span>
all_predictions_norm = [normalizer(pred) <span class="hljs-keyword">for</span> pred <span class="hljs-keyword">in</span> all_predictions]
all_references_norm = [normalizer(label) <span class="hljs-keyword">for</span> label <span class="hljs-keyword">in</span> common_voice_test[<span class="hljs-string">&quot;sentence&quot;</span>]]
<span class="hljs-comment"># filtering step to only evaluate the samples that correspond to non-zero references</span>
all_predictions_norm = [
all_predictions_norm[i]
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(all_predictions_norm))
<span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(all_references_norm[i]) &gt; <span class="hljs-number">0</span>
]
all_references_norm = [
all_references_norm[i]
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(all_references_norm))
<span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(all_references_norm[i]) &gt; <span class="hljs-number">0</span>
]
wer = <span class="hljs-number">100</span> * wer_metric.compute(
references=all_references_norm, predictions=all_predictions_norm
)
wer<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mvdyro"><strong>Output:</strong></p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-attribute">125</span>.<span class="hljs-number">69809089960707</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-sppy6n">Again we see the drastic reduction in WER we achieve by normalising our references and predictions: the baseline model
achieves an orthographic test WER of 168%, while the normalised WER is 126%.</p> <p data-svelte-h="svelte-pm696u">Right then! These are the numbers that we want to try and beat when we fine-tune the model, in order to improve the Whisper
model for Dhivehi speech recognition. Continue reading to get hands-on with a fine-tuning example 🚀</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/audio-transformers-course/blob/main/chapters/en/chapter5/evaluation.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_yq3w38 = {
assets: "/docs/audio-course/pr_201/en",
base: "/docs/audio-course/pr_201/en",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/audio-course/pr_201/en/_app/immutable/entry/start.367c4d78.js"),
import("/docs/audio-course/pr_201/en/_app/immutable/entry/app.4c54ebf9.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 31],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
72.4 kB
·
Xet hash:
d49f19a693480da9deef09c903e20f46dc85d6a9e8f0a9bf50f14c565875b46a

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.