Buckets:
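| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Pre-trained models and datasets for audio classification&quot;,&quot;local&quot;:&quot;pre-trained-models-and-datasets-for-audio-classification&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;🤗 Transformers Installation&quot;,&quot;local&quot;:&quot;-transformers-installation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Keyword Spotting&quot;,&quot;local&quot;:&quot;keyword-spotting&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Minds-14&quot;,&quot;local&quot;:&quot;minds-14&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Speech Commands&quot;,&quot;local&quot;:&quot;speech-commands&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Language Identification&quot;,&quot;local&quot;:&quot;language-identification&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;FLEURS&quot;,&quot;local&quot;:&quot;fleurs&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Zero-Shot Audio Classification&quot;,&quot;local&quot;:&quot;zero-shot-audio-classification&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;What next?&quot;,&quot;local&quot;:&quot;what-next&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"> | |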
| <link href="/docs/audio-course/pr_239/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/audio-course/pr_239/en/_app/immutable/entry/start.1658692c.js"> | |
| <link rel="modulepreload" href="/docs/audio-course/pr_239/en/_app/immutable/chunks/scheduler.cd324960.js"> | |
| <link rel="modulepreload" href="/docs/audio-course/pr_239/en/_app/immutable/chunks/singletons.b42fc23b.js"> | |
| <link rel="modulepreload" href="/docs/audio-course/pr_239/en/_app/immutable/chunks/index.a0c12d66.js"> | |
| <link rel="modulepreload" href="/docs/audio-course/pr_239/en/_app/immutable/chunks/paths.cd0b54b2.js"> | |
| <link rel="modulepreload" href="/docs/audio-course/pr_239/en/_app/immutable/entry/app.83f02103.js"> | |
| <link rel="modulepreload" href="/docs/audio-course/pr_239/en/_app/immutable/chunks/preload-helper.7a3e7823.js"> | |
| <link rel="modulepreload" href="/docs/audio-course/pr_239/en/_app/immutable/chunks/index.d5c3adcc.js"> | |
| <link rel="modulepreload" href="/docs/audio-course/pr_239/en/_app/immutable/nodes/0.33fdfcd8.js"> | |
| <link rel="modulepreload" href="/docs/audio-course/pr_239/en/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/audio-course/pr_239/en/_app/immutable/nodes/23.e2227195.js"> | |
| <link rel="modulepreload" href="/docs/audio-course/pr_239/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.f42929ed.js"> | |
| <link rel="modulepreload" href="/docs/audio-course/pr_239/en/_app/immutable/chunks/CodeBlock.f3dccfdb.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Pre-trained models and datasets for audio classification","local":"pre-trained-models-and-datasets-for-audio-classification","sections":[{"title":"🤗 Transformers Installation","local":"-transformers-installation","sections":[],"depth":2},{"title":"Keyword Spotting","local":"keyword-spotting","sections":[{"title":"Minds-14","local":"minds-14","sections":[],"depth":3},{"title":"Speech Commands","local":"speech-commands","sections":[],"depth":3}],"depth":2},{"title":"Language Identification","local":"language-identification","sections":[{"title":"FLEURS","local":"fleurs","sections":[],"depth":3}],"depth":2},{"title":"Zero-Shot Audio Classification","local":"zero-shot-audio-classification","sections":[],"depth":2},{"title":"What next?","local":"what-next","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="pre-trained-models-and-datasets-for-audio-classification" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#pre-trained-models-and-datasets-for-audio-classification"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 
11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Pre-trained models and datasets for audio classification</span></h1> <p data-svelte-h="svelte-1sx6iox">The Hugging Face Hub is home to over 500 pre-trained models for audio classification. In this section, we’ll go through | |
| some of the most common audio classification tasks and suggest appropriate pre-trained models for each. Using the <code>pipeline()</code> | |
| class, switching between models and tasks is straightforward - once you know how to use <code>pipeline()</code> for one model, you’ll | |
| be able to use it for any model on the Hub with no code changes! This makes experimenting with the <code>pipeline()</code> class extremely | |
| fast, allowing you to quickly select the best pre-trained model for your needs.</p> <p data-svelte-h="svelte-1fbyaxi">Before we jump into the various audio classification problems, let’s quickly recap the transformer architectures typically | |
| used. The standard audio classification architecture is motivated by the nature of the task; we want to transform a sequence | |
| of audio inputs (i.e. our input audio array) into a single class label prediction. Encoder-only models first map the input | |
| audio sequence into a sequence of hidden-state representations by passing the inputs through a transformer block. The | |
| sequence of hidden-state representations is then mapped to a class label output by taking the mean over the hidden-states, | |
| and passing the resulting vector through a linear classification layer. Hence, there is a preference for <em>encoder-only</em> | |
| models for audio classification.</p> <p data-svelte-h="svelte-1webo38">Decoder-only models introduce unnecessary complexity to the task, since they assume that the outputs can also be a <em>sequence</em> | |
| of predictions (rather than a single class label prediction), and so generate multiple outputs. Therefore, they have slower | |
| inference speed and tend not to be used. Encoder-decoder models are largely omitted for the same reason. These architecture | |
| choices are analogous to those in NLP, where encoder-only models such as <a href="https://huggingface.co/blog/bert-101" rel="nofollow">BERT</a> | |
| are favoured for sequence classification tasks, and decoder-only models such as GPT are reserved for sequence generation tasks.</p> <p data-svelte-h="svelte-rwwaij">Now that we’ve recapped the standard transformer architecture for audio classification, let’s jump into the different | |
| subsets of audio classification and cover the most popular models!</p> <h2 class="relative group"><a id="-transformers-installation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-transformers-installation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>🤗 Transformers Installation</span></h2> <p data-svelte-h="svelte-1tugr7p">At the time of writing, the latest updates required for audio classification pipeline are only on the <code>main</code> version of | |
| the 🤗 Transformers repository, rather than the latest PyPI version. To make sure we have these updates locally, we’ll | |
| install Transformers from the <code>main</code> branch with the following command:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install git+https:<span class="hljs-regexp">//gi</span>thub.com<span class="hljs-regexp">/huggingface/</span>transformers<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="keyword-spotting" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#keyword-spotting"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 
1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Keyword Spotting</span></h2> <p data-svelte-h="svelte-18ejjqs">Keyword spotting (KWS) is the task of identifying a keyword in a spoken utterance. The set of possible keywords forms the | |
| set of predicted class labels. Hence, to use a pre-trained keyword spotting model, you should ensure that your keywords | |
| match those that the model was pre-trained on. Below, we’ll introduce two datasets and models for keyword spotting.</p> <h3 class="relative group"><a id="minds-14" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#minds-14"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Minds-14</span></h3> <p data-svelte-h="svelte-1jsn8wa">Let’s go ahead and use the same <a href="https://huggingface.co/datasets/PolyAI/minds14" rel="nofollow">MINDS-14</a> dataset that you have explored | |
| in the previous unit. If you recall, MINDS-14 contains recordings of people asking an e-banking system questions in several | |
| languages and dialects, and has the <code>intent_class</code> for each recording. We can classify the recordings by intent of the call.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| minds = load_dataset(<span class="hljs-string">"PolyAI/minds14"</span>, name=<span class="hljs-string">"en-AU"</span>, split=<span class="hljs-string">"train"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1c1mr12">We’ll load the checkpoint <a href="https://huggingface.co/anton-l/xtreme_s_xlsr_300m_minds14" rel="nofollow"><code>"anton-l/xtreme_s_xlsr_300m_minds14"</code></a>, | |
| which is an XLS-R model fine-tuned on MINDS-14 for approximately 50 epochs. It achieves 90% accuracy over all languages | |
| from MINDS-14 on the evaluation set.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline | |
| classifier = pipeline( | |
| <span class="hljs-string">"audio-classification"</span>, | |
| model=<span class="hljs-string">"anton-l/xtreme_s_xlsr_300m_minds14"</span>, | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1xz16k">Finally, we can pass a sample to the classification pipeline to make a prediction:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->classifier(minds[<span class="hljs-number">0</span>][<span class="hljs-string">"audio"</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mvdyro"><strong>Output:</strong></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" 
preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[ | |
| {<span class="hljs-string">"score"</span>: <span class="hljs-number">0.9631525278091431</span>, <span class="hljs-string">"label"</span>: <span class="hljs-string">"pay_bill"</span>}, | |
| {<span class="hljs-string">"score"</span>: <span class="hljs-number">0.02819698303937912</span>, <span class="hljs-string">"label"</span>: <span class="hljs-string">"freeze"</span>}, | |
| {<span class="hljs-string">"score"</span>: <span class="hljs-number">0.0032787492964416742</span>, <span class="hljs-string">"label"</span>: <span class="hljs-string">"card_issues"</span>}, | |
| {<span class="hljs-string">"score"</span>: <span class="hljs-number">0.0019414445850998163</span>, <span class="hljs-string">"label"</span>: <span class="hljs-string">"abroad"</span>}, | |
| {<span class="hljs-string">"score"</span>: <span class="hljs-number">0.0008378693601116538</span>, <span class="hljs-string">"label"</span>: <span class="hljs-string">"high_value_payment"</span>}, | |
| ]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-k1rlzh">Great! We’ve identified that the intent of the call was paying a bill, with probability 96%. You can imagine this kind of | |
| keyword spotting system being used as the first stage of an automated call centre, where we want to categorise incoming | |
| customer calls based on their query and offer them contextualised support accordingly.</p> <h3 class="relative group"><a id="speech-commands" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#speech-commands"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Speech Commands</span></h3> <p data-svelte-h="svelte-11rdezd">Speech Commands is a dataset of spoken words designed to evaluate audio classification models on simple command words. | |
| The dataset consists of 15 classes of keywords, a class for silence, and an unknown class to include the false positives. | |
| The 15 keywords are single words that would typically be used in on-device settings to control basic tasks or launch | |
| other processes.</p> <p data-svelte-h="svelte-1o56zbf">A similar model is running continuously on your mobile phone. Here, instead of having single command words, we have | |
| ‘wake words’ specific to your device, such as “Hey Google” or “Hey Siri”. When the audio classification model detects | |
| these wake words, it triggers your phone to start listening to the microphone and transcribe your speech using a speech | |
| recognition model.</p> <p data-svelte-h="svelte-ts3or1">The audio classification model is much smaller and lighter than the speech recognition model, often only several million | |
| parameters compared to several hundred million for speech recognition. Thus, it can be run continuously on your device | |
| without draining your battery! Only when the wake word is detected is the larger speech recognition model launched, and | |
| afterwards it is shut down again. We’ll cover transformer models for speech recognition in the next Unit, so by the end | |
| of the course you should have the tools you need to build your own voice-activated assistant!</p> <p data-svelte-h="svelte-fipl47">As with any dataset on the Hugging Face Hub, we can get a feel for the kind of audio data it contains without downloading | |
| or committing it to memory. After heading to the <a href="https://huggingface.co/datasets/speech_commands" rel="nofollow">Speech Commands’ dataset card</a> | |
| on the Hub, we can use the Dataset Viewer to scroll through the first 100 samples of the dataset, listening to the audio | |
| files and checking any other metadata information:</p> <div class="flex justify-center" data-svelte-h="svelte-sk0ezl"><img src="https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/speech_commands.png" alt="Diagram of datasets viewer."></div> <p data-svelte-h="svelte-yyvm6u">The Dataset Viewer is a brilliant way of experiencing audio datasets before committing to using them. You can pick any | |
| dataset on the Hub, scroll through the samples and listen to the audio for the different subsets and splits, gauging whether | |
| it’s the right dataset for your needs. Once you’ve selected a dataset, it’s trivial to load the data so that you can start | |
| using it.</p> <p data-svelte-h="svelte-p686we">Let’s do exactly that and load a sample of the Speech Commands dataset using streaming mode:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->speech_commands = load_dataset( | |
| <span class="hljs-string">"speech_commands"</span>, <span class="hljs-string">"v0.02"</span>, split=<span class="hljs-string">"validation"</span>, streaming=<span class="hljs-literal">True</span> | |
| ) | |
| sample = <span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(speech_commands))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-hz24qn">We’ll load an official <a href="https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer" rel="nofollow">Audio Spectrogram Transformer</a> | |
| checkpoint fine-tuned on the Speech Commands dataset, under the namespace <a href="https://huggingface.co/MIT/ast-finetuned-speech-commands-v2" rel="nofollow"><code>"MIT/ast-finetuned-speech-commands-v2"</code></a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->classifier = pipeline( | |
| <span class="hljs-string">"audio-classification"</span>, model=<span class="hljs-string">"MIT/ast-finetuned-speech-commands-v2"</span> | |
| ) | |
| classifier(sample[<span class="hljs-string">"audio"</span>].copy())<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mvdyro"><strong>Output:</strong></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[{<span class="hljs-symbol">'score</span><span class="hljs-symbol">':</span> <span class="hljs-number">0.9999892711639404</span>, <span class="hljs-symbol">'label</span><span class="hljs-symbol">':</span> <span class="hljs-symbol">'backward</span>'}, | |
| {<span class="hljs-symbol">'score</span><span class="hljs-symbol">':</span> <span class="hljs-number">1.7504888774055871</span>e-06, <span class="hljs-symbol">'label</span><span class="hljs-symbol">':</span> <span class="hljs-symbol">'happy</span>'}, | |
| {<span class="hljs-symbol">'score</span><span class="hljs-symbol">':</span> <span class="hljs-number">6.703040185129794</span>e-07, <span class="hljs-symbol">'label</span><span class="hljs-symbol">':</span> <span class="hljs-symbol">'follow</span>'}, | |
| {<span class="hljs-symbol">'score</span><span class="hljs-symbol">':</span> <span class="hljs-number">5.805884484288981</span>e-07, <span class="hljs-symbol">'label</span><span class="hljs-symbol">':</span> <span class="hljs-symbol">'stop</span>'}, | |
| {<span class="hljs-symbol">'score</span><span class="hljs-symbol">':</span> <span class="hljs-number">5.614546694232558</span>e-07, <span class="hljs-symbol">'label</span><span class="hljs-symbol">':</span> <span class="hljs-symbol">'up</span>'}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-kbnmt8">Cool! Looks like the example contains the word “backward” with high probability. We can take a listen to the sample | |
| and verify this is correct:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->from <span class="hljs-module-access"><span class="hljs-module"><span class="hljs-identifier">IPython</span>.</span></span>display import Audio | |
| <span class="hljs-constructor">Audio(<span class="hljs-params">sample</span>[<span class="hljs-string">"audio"</span>][<span class="hljs-string">"array"</span>], <span class="hljs-params">rate</span>=<span class="hljs-params">sample</span>[<span class="hljs-string">"audio"</span>][<span class="hljs-string">"sampling_rate"</span>])</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-7botfj">Now, you might be wondering how we’ve selected these pre-trained models to show you in these audio classification examples. | |
| The truth is, finding pre-trained models for your dataset and task is very straightforward! The first thing we need to do | |
| is head to the Hugging Face Hub and click on the “Models” tab: <a href="https://huggingface.co/models" rel="nofollow">https://huggingface.co/models</a></p> <p data-svelte-h="svelte-csh5dy">This is going to bring up all the models on the Hugging Face Hub, sorted by downloads in the past 30 days:</p> <div class="flex justify-center" data-svelte-h="svelte-bqfzrn"><img src="https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/all_models.png"></div> <p data-svelte-h="svelte-6zblrj">You’ll notice on the left-hand side that we have a selection of tabs that we can select to filter models by task, library, | |
| dataset, etc. Scroll down and select the task “Audio Classification” from the list of audio tasks:</p> <div class="flex justify-center" data-svelte-h="svelte-1t2za6k"><img src="https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/by_audio_classification.png"></div> <p data-svelte-h="svelte-7t6r33">We’re now presented with the sub-set of 500+ audio classification models on the Hub. To further refine this selection, we | |
| can filter models by dataset. Click on the tab “Datasets”, and in the search box type “speech_commands”. As you begin typing, | |
| you’ll see the selection for <code>speech_commands</code> appear underneath the search tab. You can click this button to filter all | |
| audio classification models to those fine-tuned on the Speech Commands dataset:</p> <div class="flex justify-center" data-svelte-h="svelte-1sstogo"><img src="https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/by_speech_commands.png"></div> <p data-svelte-h="svelte-1jgbk0j">Great! We see that we have 6 pre-trained models available to us for this specific dataset and task. You’ll recognise the | |
| first of these models as the Audio Spectrogram Transformer checkpoint that we used in the previous example. This process | |
| of filtering models on the Hub is exactly how we went about selecting the checkpoint to show you!</p> <h2 class="relative group"><a id="language-identification" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#language-identification"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Language Identification</span></h2> <p data-svelte-h="svelte-5rskgo">Language identification (LID) is the task of identifying the language spoken in an audio sample from a list of candidate | |
| languages. LID can form an important part of many speech pipelines. For example, given an audio sample in an unknown language, | |
| an LID model can be used to categorise the language(s) spoken in the audio sample, and then select an appropriate speech | |
| recognition model trained on that language to transcribe the audio.</p> <h3 class="relative group"><a id="fleurs" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fleurs"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>FLEURS</span></h3> <p data-svelte-h="svelte-1n5euwf">FLEURS (Few-shot Learning Evaluation of Universal Representations of Speech) is a dataset for evaluating speech recognition | |
| systems in 102 languages, including many that are classified as ‘low-resource’. Take a look at the FLEURS dataset | |
| card on the Hub and explore the different languages that are present: <a href="https://huggingface.co/datasets/google/fleurs" rel="nofollow">google/fleurs</a>. | |
| Can you find your native tongue here? If not, what’s the most closely related language?</p> <p data-svelte-h="svelte-a6zngd">Let’s load up a sample from the validation split of the FLEURS dataset using streaming mode:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->fleurs = load_dataset(<span class="hljs-string">"google/fleurs"</span>, <span class="hljs-string">"all"</span>, split=<span class="hljs-string">"validation"</span>, streaming=<span class="hljs-literal">True</span>) | |
| sample = <span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(fleurs))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1g1l39x">Great! Now we can load our audio classification model. For this, we’ll use a version of <a href="https://arxiv.org/pdf/2212.04356.pdf" rel="nofollow">Whisper</a> | |
| fine-tuned on the FLEURS dataset, which is currently the most performant LID model on the Hub:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->classifier = pipeline( | |
| <span class="hljs-string">"audio-classification"</span>, model=<span class="hljs-string">"sanchit-gandhi/whisper-medium-fleurs-lang-id"</span> | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1fons3v">We can then pass the audio through our classifier and generate a prediction:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->classifier(sample[<span class="hljs-string">"audio"</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mvdyro"><strong>Output:</strong></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 
32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[{<span class="hljs-symbol">'score</span><span class="hljs-symbol">':</span> <span class="hljs-number">0.9999330043792725</span>, <span class="hljs-symbol">'label</span><span class="hljs-symbol">':</span> <span class="hljs-symbol">'Afrikaans</span>'}, | |
| {<span class="hljs-symbol">'score</span><span class="hljs-symbol">':</span> <span class="hljs-number">7.093023668858223</span>e-06, <span class="hljs-symbol">'label</span><span class="hljs-symbol">':</span> <span class="hljs-symbol">'Northern-Sotho</span>'}, | |
| {<span class="hljs-symbol">'score</span><span class="hljs-symbol">':</span> <span class="hljs-number">4.269149485480739</span>e-06, <span class="hljs-symbol">'label</span><span class="hljs-symbol">':</span> <span class="hljs-symbol">'Icelandic</span>'}, | |
| {<span class="hljs-symbol">'score</span><span class="hljs-symbol">':</span> <span class="hljs-number">3.2661141631251667</span>e-06, <span class="hljs-symbol">'label</span><span class="hljs-symbol">':</span> <span class="hljs-symbol">'Danish</span>'}, | |
| {<span class="hljs-symbol">'score</span><span class="hljs-symbol">':</span> <span class="hljs-number">3.2580724109720904</span>e-06, <span class="hljs-symbol">'label</span><span class="hljs-symbol">':</span> <span class="hljs-symbol">'Cantonese</span> Chinese'}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1044ak6">We can see that the model predicted the audio was in Afrikaans with extremely high probability (near 1). The FLEURS dataset | |
| contains audio data from a wide range of languages - we can see that possible class labels include Northern-Sotho, Icelandic, | |
| Danish and Cantonese Chinese amongst others. You can find the full list of languages on the dataset card here: <a href="https://huggingface.co/datasets/google/fleurs" rel="nofollow">google/fleurs</a>.</p> <p data-svelte-h="svelte-857vsk">Over to you! What other checkpoints can you find for FLEURS LID on the Hub? What transformer models are they using under-the-hood?</p> <h2 class="relative group"><a id="zero-shot-audio-classification" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#zero-shot-audio-classification"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Zero-Shot Audio Classification</span></h2> <p data-svelte-h="svelte-1wf33gz">In the traditional paradigm for audio classification, the model predicts a class label from a <em>pre-defined</em> set of | |
| possible classes. This poses a barrier to using pre-trained models for audio classification, since the label set of the | |
| pre-trained model must match that of the downstream task. For the previous example of LID, the model must predict one of | |
| the 102 language classes on which it was trained. If the downstream task actually requires 110 languages, the model would | |
| not be able to predict 8 of the 110 languages, and so would require re-training to achieve full coverage. This limits the | |
| effectiveness of transfer learning for audio classification tasks.</p> <p data-svelte-h="svelte-1bub4i">Zero-shot audio classification is a method for taking a pre-trained audio classification model trained on a set of labelled | |
| examples and enabling it to classify new examples from previously unseen classes. Let’s take a look at how we | |
| can achieve this!</p> <p data-svelte-h="svelte-xv3v8">Currently, 🤗 Transformers supports one kind of model for zero-shot audio classification: the <a href="https://huggingface.co/docs/transformers/model_doc/clap" rel="nofollow">CLAP model</a>. | |
| CLAP is a transformer-based model that takes both audio and text as inputs, and computes the <em>similarity</em> between the two. | |
| If we pass a text input that strongly correlates with an audio input, we’ll get a high similarity score. Conversely, passing | |
| a text input that is completely unrelated to the audio input will return a low similarity.</p> <p data-svelte-h="svelte-yr149b">We can use this similarity prediction for zero-shot audio classification by passing one audio input to the model and | |
| multiple candidate labels. The model will return a similarity score for each of the candidate labels, and we can pick the | |
| one that has the highest score as our prediction.</p> <p data-svelte-h="svelte-1osll5p">Let’s take an example where we use one audio input from the <a href="https://huggingface.co/datasets/ashraq/esc50" rel="nofollow">Environmental Sound Classification (ESC)</a> | |
| dataset:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->dataset = load_dataset(<span class="hljs-string">"ashraq/esc50"</span>, split=<span class="hljs-string">"train"</span>, streaming=<span class="hljs-literal">True</span>) | |
| audio_sample = <span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(dataset))[<span class="hljs-string">"audio"</span>][<span class="hljs-string">"array"</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-18w9coi">We then define our candidate labels, which form the set of possible classification labels. The model will return a | |
| classification probability for each of the labels we define. This means we need to know <em>a-priori</em> the set of possible | |
| labels in our classification problem, such that the correct label is contained within the set and is thus assigned a | |
| valid probability score. Note that we can either pass the full set of labels to the model, or a hand-selected subset | |
| that we believe contains the correct label. Passing the full set of labels is going to be more exhaustive, but comes | |
| at the expense of lower classification accuracy since the classification space is larger (provided the correct label is in | |
| our chosen subset of labels):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->candidate_labels = [<span class="hljs-string">"Sound of a dog"</span>, <span class="hljs-string">"Sound of vacuum cleaner"</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-163k6s1">We can run both through the model to find the candidate label that is <em>most similar</em> to the audio input:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" 
preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->classifier = pipeline( | |
| task=<span class="hljs-string">"zero-shot-audio-classification"</span>, model=<span class="hljs-string">"laion/clap-htsat-unfused"</span> | |
| ) | |
| classifier(audio_sample, candidate_labels=candidate_labels)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mvdyro"><strong>Output:</strong></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[{<span class="hljs-symbol">'score</span><span class="hljs-symbol">':</span> <span class="hljs-number">0.9997242093086243</span>, <span class="hljs-symbol">'label</span><span class="hljs-symbol">':</span> <span class="hljs-symbol">'Sound</span> of a dog'}, {<span class="hljs-symbol">'score</span><span class="hljs-symbol">':</span> <span class="hljs-number">0.0002758323971647769</span>, <span class="hljs-symbol">'label</span><span class="hljs-symbol">':</span> <span class="hljs-symbol">'Sound</span> of vacuum cleaner'}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1slv1zu">Alright! 
The model seems pretty confident we have the sound of a dog - it predicts it with 99.97% probability, so we’ll | |
| take that as our prediction. Let’s confirm whether we were right by listening to the audio sample (don’t turn up your | |
| volume too high or else you might get a jump!):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Audio(audio_sample, rate=<span class="hljs-number">16000</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-125h3os">Perfect! We have the sound of a dog barking 🐕, which aligns with the model’s prediction. Have a play with different audio | |
| samples and different candidate labels - can you define a set of labels that give good generalisation across the ESC | |
| dataset? Hint: think about where you could find information on the possible sounds in ESC and construct your labels accordingly!</p> <p data-svelte-h="svelte-elwwua">You might be wondering why we don’t use the zero-shot audio classification pipeline for <strong>all</strong> audio classification tasks? | |
| It seems as though we can make predictions for any audio classification problem by defining appropriate class labels <em>a-priori</em>, | |
| thus bypassing the constraint that our classification task needs to match the labels that the model was pre-trained on. | |
| This comes down to the nature of the CLAP model used in the zero-shot pipeline: CLAP is pre-trained on <em>generic</em> audio | |
| classification data, similar to the environmental sounds in the ESC dataset, rather than specifically speech data, like | |
| we had in the LID task. If you gave it speech in English and speech in Spanish, CLAP would know that both examples were | |
| speech data 🗣️ But it wouldn’t be able to differentiate between the languages in the same way a dedicated LID model is | |
| able to.</p> <h2 class="relative group"><a id="what-next" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#what-next"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>What next?</span></h2> <p data-svelte-h="svelte-1b8ovtt">We’ve covered a number of different audio classification tasks and presented the most relevant datasets and models that | |
| you can download from the Hugging Face Hub and use in just a few lines of code using the <code>pipeline()</code> class. These tasks | |
| included keyword spotting, language identification and zero-shot audio classification.</p> <p data-svelte-h="svelte-1dkp3zb">But what if we want to do something <strong>new</strong>? We’ve worked extensively on speech processing tasks, but this is only one | |
| aspect of audio classification. Another popular field of audio processing involves <strong>music</strong>. While music has inherently | |
| different features to speech, many of the same principles that we’ve learnt about already can be applied to music.</p> <p data-svelte-h="svelte-nfuday">In the following section, we’ll go through a step-by-step guide on how you can fine-tune a transformer model with 🤗 | |
| Transformers on the task of music classification. By the end of it, you’ll have a fine-tuned checkpoint that you can plug | |
| into the <code>pipeline()</code> class, enabling you to classify songs in exactly the same way that we’ve classified speech here!</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/audio-transformers-course/blob/main/chapters/en/chapter4/classification_models.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p> | |
| <!-- SvelteKit client bootstrap: assigns the asset/base paths and an empty env to the __sveltekit_1pbp10e global, dynamically imports the start and app entry modules in parallel, then calls kit.start() with this script's parent element as the target, node_ids [0, 23], the (all-null) data array, and no form or error state. --> <script> | |
| { | |
| __sveltekit_1pbp10e = { | |
| assets: "/docs/audio-course/pr_239/en", | |
| base: "/docs/audio-course/pr_239/en", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/audio-course/pr_239/en/_app/immutable/entry/start.1658692c.js"), | |
| import("/docs/audio-course/pr_239/en/_app/immutable/entry/app.83f02103.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 23], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 66.1 kB
- Xet hash:
- 646c2d1f447a02c753b0afee833dcf6fbdc64c3e28554473af8a8fd50aa974da
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.