Buckets:

hf-doc-build/doc-dev / audio-course /pr_239 /es /chapter2 /audio_classification_pipeline.html
rtrm's picture
download
raw
20.8 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Clasificación de audio con una pipeline&quot;,&quot;local&quot;:&quot;clasificación-de-audio-con-una-pipeline&quot;,&quot;sections&quot;:[],&quot;depth&quot;:1}">
<link href="/docs/audio-course/pr_239/es/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/audio-course/pr_239/es/_app/immutable/entry/start.e5641179.js">
<link rel="modulepreload" href="/docs/audio-course/pr_239/es/_app/immutable/chunks/scheduler.a045fce0.js">
<link rel="modulepreload" href="/docs/audio-course/pr_239/es/_app/immutable/chunks/singletons.7110d700.js">
<link rel="modulepreload" href="/docs/audio-course/pr_239/es/_app/immutable/chunks/index.2447c7d9.js">
<link rel="modulepreload" href="/docs/audio-course/pr_239/es/_app/immutable/chunks/paths.6f512667.js">
<link rel="modulepreload" href="/docs/audio-course/pr_239/es/_app/immutable/entry/app.aeb8f25d.js">
<link rel="modulepreload" href="/docs/audio-course/pr_239/es/_app/immutable/chunks/preload-helper.efc34262.js">
<link rel="modulepreload" href="/docs/audio-course/pr_239/es/_app/immutable/chunks/index.bc14ef44.js">
<link rel="modulepreload" href="/docs/audio-course/pr_239/es/_app/immutable/nodes/0.e1906c03.js">
<link rel="modulepreload" href="/docs/audio-course/pr_239/es/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/audio-course/pr_239/es/_app/immutable/nodes/13.21af1dda.js">
<link rel="modulepreload" href="/docs/audio-course/pr_239/es/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.dd3847c1.js">
<link rel="modulepreload" href="/docs/audio-course/pr_239/es/_app/immutable/chunks/CodeBlock.39c0a862.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Clasificación de audio con una pipeline&quot;,&quot;local&quot;:&quot;clasificación-de-audio-con-una-pipeline&quot;,&quot;sections&quot;:[],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="clasificación-de-audio-con-una-pipeline" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#clasificación-de-audio-con-una-pipeline"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Clasificación de audio con una pipeline</span></h1> <p data-svelte-h="svelte-k8g6jh">La clasificación de audio consiste en asignar o más etiquetas a una grabación de audio basado en su contenido. Las etiquetas
pueden corresponder a diferentes categorias del sonido, como música, voz o ruido, o etiquetas más especificas como canto de ave o
sonido de motor de carro.</p> <p data-svelte-h="svelte-opfu3e">Antes de entrar en los detalles sobre el funcionamiento de los transformers más populares para audio, y antes de hacer fine-tunning
de un modelo personalizado. Revisemos como se puede usar un modelo pre-entrenado para clasificación de audio con solo una lineas de
código de 🤗 Transformers.</p> <p data-svelte-h="svelte-1jrbmi0">Usemos el mismo conjunto de <a href="https://huggingface.co/datasets/PolyAI/minds14" rel="nofollow">MINDS-14</a> que hemos estado explorando en la
unidad anterior. Como recordarás, MINDS-14 contiene grabaciones de personas haciendo preguntas a un sistema de banca electrónica
en diferentes idiomas y dialectos, y tiene la etiqueta de <code>intent_class</code> para cada grabación. Podemos clasificar las grabaciones por
la intención de la llamada.</p> <p data-svelte-h="svelte-yr0mpi">Tal como hemos hecho antes, carguemos el subset de <code>en-AU</code> probar esta pipeline, y hagamos un upsampling a 16kHz que es
la frecuencia de muestreo que el modelo espera.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Audio
minds = load_dataset(<span class="hljs-string">&quot;PolyAI/minds14&quot;</span>, name=<span class="hljs-string">&quot;en-AU&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
minds = minds.cast_column(<span class="hljs-string">&quot;audio&quot;</span>, Audio(sampling_rate=<span class="hljs-number">16_000</span>))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-6lqry8">Para clasificar una grabación de audio en alguna de las clases, podemos usar la pipeline de <code>audio-classification</code> que tiene 🤗 Transformers.
En nuestro caso, necesitamos un modelo que ha sido entrenado para clasificar la intención en donde especificamente hayan la base de datos de
MINDS-14. Afortunadamente para nosotros, el Hub tiene un modelo que hace justamente esto. Carguemoslo usando la función <code>pipeline()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline
classifier = pipeline(
<span class="hljs-string">&quot;audio-classification&quot;</span>,
model=<span class="hljs-string">&quot;anton-l/xtreme_s_xlsr_300m_minds14&quot;</span>,
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-72hnuq">Esta pipeline espera los datos de audio como un array de Numpy. Todo el preprocesamiento de los datos de audio será hecha por
la pipeline. Seleccionemos un ejemplo para probarlo:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->example = minds[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-fv4joi">Si recuerdas la estructura del conjunto de datos, los datos en bruto de audio se almacenan en un array de Numpy en
la columna <code>[&quot;audio&quot;][&quot;array&quot;]</code>, que podemos pasar directamente al <code>classifier</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->classifier(example[<span class="hljs-string">&quot;audio&quot;</span>][<span class="hljs-string">&quot;array&quot;</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mvdyro"><strong>Output:</strong></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[
{<span class="hljs-string">&quot;score&quot;</span>: <span class="hljs-number">0.9631525278091431</span>, <span class="hljs-string">&quot;label&quot;</span>: <span class="hljs-string">&quot;pay_bill&quot;</span>},
{<span class="hljs-string">&quot;score&quot;</span>: <span class="hljs-number">0.02819698303937912</span>, <span class="hljs-string">&quot;label&quot;</span>: <span class="hljs-string">&quot;freeze&quot;</span>},
{<span class="hljs-string">&quot;score&quot;</span>: <span class="hljs-number">0.0032787492964416742</span>, <span class="hljs-string">&quot;label&quot;</span>: <span class="hljs-string">&quot;card_issues&quot;</span>},
{<span class="hljs-string">&quot;score&quot;</span>: <span class="hljs-number">0.0019414445850998163</span>, <span class="hljs-string">&quot;label&quot;</span>: <span class="hljs-string">&quot;abroad&quot;</span>},
{<span class="hljs-string">&quot;score&quot;</span>: <span class="hljs-number">0.0008378693601116538</span>, <span class="hljs-string">&quot;label&quot;</span>: <span class="hljs-string">&quot;high_value_payment&quot;</span>},
]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-50kvxb">Este modelo esta bastante seguro que la intención al llamar fue preguntar sobre el pago de una cuenta. Revisemos que la
etiqueta original para este ejemplo es:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->id2label = minds.features[<span class="hljs-string">&quot;intent_class&quot;</span>].int2str
id2label(example[<span class="hljs-string">&quot;intent_class&quot;</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mvdyro"><strong>Output:</strong></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">&quot;pay_bill&quot;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1lqv2vm">¡Hurra! la etiqueta predecida por el modelo ¡era la correcta! Aqui tuvimos suerte de encontrar un modelo que tenia exactamente
las etiquetas que necesitabamos. La mayoria de las veces, cuando estamos tratando con una tarea de clasificación, el conjunto de clases
de un modelo pre-entrenado no son exactamente las que necesitamos. En este caso, puedes hacer un fine-tunning del modelo pre entrenado
para “calibralo” a tu conjunto de clases. Aprenderemos como se puede hacer este proceso en las próximas unidad. Por ahora, echemos un
vistazo a otra tarea muy común en el procesamiento del habla, <em>automatic speech recognition</em>.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/audio-transformers-course/blob/main/chapters/es/chapter2/audio_classification_pipeline.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_1t9zdhx = {
assets: "/docs/audio-course/pr_239/es",
base: "/docs/audio-course/pr_239/es",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/audio-course/pr_239/es/_app/immutable/entry/start.e5641179.js"),
import("/docs/audio-course/pr_239/es/_app/immutable/entry/app.aeb8f25d.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 13],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
20.8 kB
·
Xet hash:
69db7f1ce83a0d0d00a4dbcd761b5a94eda2cbdebfbeb11dcfd26c76d36989d8

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.