Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"utils/audio","local":"utilsaudio","sections":[{"title":"utils/audio.read_audio(url, sampling_rate) ⇒ <code> Promise. < Float32Array > </code>","local":"utilsaudioreadaudiourl-samplingrate--code-promise--float32array--code","sections":[{"title":"read_audio~audio : <code> Float32Array </code>","local":"readaudioaudio--code-float32array-code","sections":[],"depth":3}],"depth":2},{"title":"utils/audio.hanning(M) ⇒ <code> Float64Array </code>","local":"utilsaudiohanningm--code-float64array-code","sections":[],"depth":2},{"title":"utils/audio.hamming(M) ⇒ <code> Float64Array </code>","local":"utilsaudiohammingm--code-float64array-code","sections":[],"depth":2},{"title":"utils/audio.mel_filter_bank(num_frequency_bins, num_mel_filters, min_frequency, max_frequency, sampling_rate, [norm], [mel_scale], [triangularize_in_mel_space]) ⇒ <code> Array. < Array < number > > </code>","local":"utilsaudiomelfilterbanknumfrequencybins-nummelfilters-minfrequency-maxfrequency-samplingrate-norm-melscale-triangularizeinmelspace--code-array--array--number---code","sections":[],"depth":2},{"title":"utils/audio.spectrogram(waveform, window, frame_length, hop_length, options) ⇒ <code> Promise. 
< Tensor > </code>","local":"utilsaudiospectrogramwaveform-window-framelength-hoplength-options--code-promise--tensor--code","sections":[],"depth":2},{"title":"utils/audio.window_function(window_length, name, options) ⇒ <code> Float64Array </code>","local":"utilsaudiowindowfunctionwindowlength-name-options--code-float64array-code","sections":[],"depth":2},{"title":"utils/audio~generalized_cosine_window(M, a_0) ⇒ <code> Float64Array </code>","local":"utilsaudiogeneralizedcosinewindowm-a0--code-float64array-code","sections":[],"depth":2},{"title":"utils/audio~hertz_to_mel(freq, [mel_scale]) ⇒ <code> T </code>","local":"utilsaudiohertztomelfreq-melscale--code-t-code","sections":[],"depth":2},{"title":"utils/audio~mel_to_hertz(mels, [mel_scale]) ⇒ <code> T </code>","local":"utilsaudiomeltohertzmels-melscale--code-t-code","sections":[],"depth":2},{"title":"utils/audio~_create_triangular_filter_bank(fft_freqs, filter_freqs) ⇒ <code> Array. < Array < number > > </code>","local":"utilsaudiocreatetriangularfilterbankfftfreqs-filterfreqs--code-array--array--number---code","sections":[],"depth":2},{"title":"utils/audio~linspace(start, end, num) ⇒","local":"utilsaudiolinspacestart-end-num-","sections":[],"depth":2},{"title":"utils/audio~padReflect(array, left, right) ⇒ <code> T </code>","local":"utilsaudiopadreflectarray-left-right--code-t-code","sections":[],"depth":2},{"title":"utils/audio~_db_conversion_helper(spectrogram, factor, reference, min_value, db_range) ⇒ <code> T </code>","local":"utilsaudiodbconversionhelperspectrogram-factor-reference-minvalue-dbrange--code-t-code","sections":[],"depth":2},{"title":"utils/audio~amplitude_to_db(spectrogram, [reference], [min_value], [db_range]) ⇒ <code> T </code>","local":"utilsaudioamplitudetodbspectrogram-reference-minvalue-dbrange--code-t-code","sections":[],"depth":2},{"title":"utils/audio~power_to_db(spectrogram, [reference], [min_value], [db_range]) ⇒ <code> T 
</code>","local":"utilsaudiopowertodbspectrogram-reference-minvalue-dbrange--code-t-code","sections":[],"depth":2}],"depth":1}"> | |
| <link href="/docs/transformers.js/pr_1113/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/transformers.js/pr_1113/en/_app/immutable/entry/start.88a6e140.js"> | |
| <link rel="modulepreload" href="/docs/transformers.js/pr_1113/en/_app/immutable/chunks/scheduler.0219f8bd.js"> | |
| <link rel="modulepreload" href="/docs/transformers.js/pr_1113/en/_app/immutable/chunks/singletons.c59c6d8d.js"> | |
| <link rel="modulepreload" href="/docs/transformers.js/pr_1113/en/_app/immutable/chunks/paths.8e090985.js"> | |
| <link rel="modulepreload" href="/docs/transformers.js/pr_1113/en/_app/immutable/entry/app.0003020d.js"> | |
| <link rel="modulepreload" href="/docs/transformers.js/pr_1113/en/_app/immutable/chunks/index.f61edf3b.js"> | |
| <link rel="modulepreload" href="/docs/transformers.js/pr_1113/en/_app/immutable/nodes/0.25c65ab2.js"> | |
| <link rel="modulepreload" href="/docs/transformers.js/pr_1113/en/_app/immutable/nodes/16.87ff33e9.js"> | |
| <link rel="modulepreload" href="/docs/transformers.js/pr_1113/en/_app/immutable/chunks/EditOnGithub.48fa589f.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"utils/audio","local":"utilsaudio","sections":[{"title":"utils/audio.read_audio(url, sampling_rate) ⇒ <code> Promise. < Float32Array > </code>","local":"utilsaudioreadaudiourl-samplingrate--code-promise--float32array--code","sections":[{"title":"read_audio~audio : <code> Float32Array </code>","local":"readaudioaudio--code-float32array-code","sections":[],"depth":3}],"depth":2},{"title":"utils/audio.hanning(M) ⇒ <code> Float64Array </code>","local":"utilsaudiohanningm--code-float64array-code","sections":[],"depth":2},{"title":"utils/audio.hamming(M) ⇒ <code> Float64Array </code>","local":"utilsaudiohammingm--code-float64array-code","sections":[],"depth":2},{"title":"utils/audio.mel_filter_bank(num_frequency_bins, num_mel_filters, min_frequency, max_frequency, sampling_rate, [norm], [mel_scale], [triangularize_in_mel_space]) ⇒ <code> Array. < Array < number > > </code>","local":"utilsaudiomelfilterbanknumfrequencybins-nummelfilters-minfrequency-maxfrequency-samplingrate-norm-melscale-triangularizeinmelspace--code-array--array--number---code","sections":[],"depth":2},{"title":"utils/audio.spectrogram(waveform, window, frame_length, hop_length, options) ⇒ <code> Promise. 
< Tensor > </code>","local":"utilsaudiospectrogramwaveform-window-framelength-hoplength-options--code-promise--tensor--code","sections":[],"depth":2},{"title":"utils/audio.window_function(window_length, name, options) ⇒ <code> Float64Array </code>","local":"utilsaudiowindowfunctionwindowlength-name-options--code-float64array-code","sections":[],"depth":2},{"title":"utils/audio~generalized_cosine_window(M, a_0) ⇒ <code> Float64Array </code>","local":"utilsaudiogeneralizedcosinewindowm-a0--code-float64array-code","sections":[],"depth":2},{"title":"utils/audio~hertz_to_mel(freq, [mel_scale]) ⇒ <code> T </code>","local":"utilsaudiohertztomelfreq-melscale--code-t-code","sections":[],"depth":2},{"title":"utils/audio~mel_to_hertz(mels, [mel_scale]) ⇒ <code> T </code>","local":"utilsaudiomeltohertzmels-melscale--code-t-code","sections":[],"depth":2},{"title":"utils/audio~_create_triangular_filter_bank(fft_freqs, filter_freqs) ⇒ <code> Array. < Array < number > > </code>","local":"utilsaudiocreatetriangularfilterbankfftfreqs-filterfreqs--code-array--array--number---code","sections":[],"depth":2},{"title":"utils/audio~linspace(start, end, num) ⇒","local":"utilsaudiolinspacestart-end-num-","sections":[],"depth":2},{"title":"utils/audio~padReflect(array, left, right) ⇒ <code> T </code>","local":"utilsaudiopadreflectarray-left-right--code-t-code","sections":[],"depth":2},{"title":"utils/audio~_db_conversion_helper(spectrogram, factor, reference, min_value, db_range) ⇒ <code> T </code>","local":"utilsaudiodbconversionhelperspectrogram-factor-reference-minvalue-dbrange--code-t-code","sections":[],"depth":2},{"title":"utils/audio~amplitude_to_db(spectrogram, [reference], [min_value], [db_range]) ⇒ <code> T </code>","local":"utilsaudioamplitudetodbspectrogram-reference-minvalue-dbrange--code-t-code","sections":[],"depth":2},{"title":"utils/audio~power_to_db(spectrogram, [reference], [min_value], [db_range]) ⇒ <code> T 
</code>","local":"utilsaudiopowertodbspectrogram-reference-minvalue-dbrange--code-t-code","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <a id="module_utils/audio" class="group"></a> <h1 class="relative group"><a id="utilsaudio" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#utilsaudio"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>utils/audio</span></h1> <p data-svelte-h="svelte-12buzkq">Helper module for audio processing.</p> <p data-svelte-h="svelte-il3kl8">These functions and classes are only used internally, | |
| meaning an end-user shouldn’t need to access anything here.</p> <ul data-svelte-h="svelte-hrpzfw"><li><a href="#module_utils/audio">utils/audio</a><ul><li><em>static</em><ul><li><a href="#module_utils/audio.read_audio"><code>.read_audio(url, sampling_rate)</code></a> ⇒ <code>Promise.<Float32Array></code><ul><li><a href="#module_utils/audio.read_audio..audio"><code>~audio</code></a> : <code>Float32Array</code></li></ul></li> <li><a href="#module_utils/audio.hanning"><code>.hanning(M)</code></a> ⇒ <code>Float64Array</code></li> <li><a href="#module_utils/audio.hamming"><code>.hamming(M)</code></a> ⇒ <code>Float64Array</code></li> <li><a href="#module_utils/audio.mel_filter_bank"><code>.mel_filter_bank(num_frequency_bins, num_mel_filters, min_frequency, max_frequency, sampling_rate, [norm], [mel_scale], [triangularize_in_mel_space])</code></a> ⇒ <code>Array.<Array<number>></code></li> <li><a href="#module_utils/audio.spectrogram"><code>.spectrogram(waveform, window, frame_length, hop_length, options)</code></a> ⇒ <a href="#Tensor"><code>Promise.<Tensor></code></a></li> <li><a href="#module_utils/audio.window_function"><code>.window_function(window_length, name, options)</code></a> ⇒ <code>Float64Array</code></li></ul></li> <li><em>inner</em><ul><li><a href="#module_utils/audio..generalized_cosine_window"><code>~generalized_cosine_window(M, a_0)</code></a> ⇒ <code>Float64Array</code></li> <li><a href="#module_utils/audio..hertz_to_mel"><code>~hertz_to_mel(freq, [mel_scale])</code></a> ⇒ <code>T</code></li> <li><a href="#module_utils/audio..mel_to_hertz"><code>~mel_to_hertz(mels, [mel_scale])</code></a> ⇒ <code>T</code></li> <li><a href="#module_utils/audio.._create_triangular_filter_bank"><code>~_create_triangular_filter_bank(fft_freqs, filter_freqs)</code></a> ⇒ <code>Array.<Array<number>></code></li> <li><a href="#module_utils/audio..linspace"><code>~linspace(start, end, num)</code></a> ⇒</li> <li><a href="#module_utils/audio..padReflect"><code>~padReflect(array, 
left, right)</code></a> ⇒ <code>T</code></li> <li><a href="#module_utils/audio.._db_conversion_helper"><code>~_db_conversion_helper(spectrogram, factor, reference, min_value, db_range)</code></a> ⇒ <code>T</code></li> <li><a href="#module_utils/audio..amplitude_to_db"><code>~amplitude_to_db(spectrogram, [reference], [min_value], [db_range])</code></a> ⇒ <code>T</code></li> <li><a href="#module_utils/audio..power_to_db"><code>~power_to_db(spectrogram, [reference], [min_value], [db_range])</code></a> ⇒ <code>T</code></li></ul></li></ul></li></ul> <hr> <a id="module_utils/audio.read_audio" class="group"></a> <h2 class="relative group"><a id="utilsaudioreadaudiourl-samplingrate--code-promise--float32array--code" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#utilsaudioreadaudiourl-samplingrate--code-promise--float32array--code"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>utils/audio.read_audio(url, sampling_rate) ⇒ <code> Promise. 
< Float32Array > </code></span></h2> <p data-svelte-h="svelte-wq5n0b">Helper function to read audio from a path/URL.</p> <p data-svelte-h="svelte-wzwr9x"><strong>Kind</strong>: static method of <a href="#module_utils/audio"><code>utils/audio</code></a><br> <strong>Returns</strong>: <code>Promise.<Float32Array></code> - The decoded audio as a <code>Float32Array</code>.</p> <table data-svelte-h="svelte-1u23bjg"><thead><tr><th>Param</th><th>Type</th><th>Description</th></tr></thead> <tbody><tr><td>url</td><td><code>string</code> | <code>URL</code></td><td><p>The path/URL to load the audio from.</p></td> </tr><tr><td>sampling_rate</td><td><code>number</code></td><td><p>The sampling rate to use when decoding the audio.</p></td></tr></tbody></table> <hr> <a id="module_utils/audio.read_audio..audio" class="group"></a> <h3 class="relative group"><a id="readaudioaudio--code-float32array-code" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#readaudioaudio--code-float32array-code"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>read_audio~audio : <code> Float32Array </code></span></h3> <p 
data-svelte-h="svelte-kcz1h5"><strong>Kind</strong>: inner property of <a href="#module_utils/audio.read_audio"><code>read_audio</code></a></p> <hr> <a id="module_utils/audio.hanning" class="group"></a> <h2 class="relative group"><a id="utilsaudiohanningm--code-float64array-code" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#utilsaudiohanningm--code-float64array-code"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>utils/audio.hanning(M) ⇒ <code> Float64Array </code></span></h2> <p data-svelte-h="svelte-13ns2vg">Generates a Hanning window of length M. | |
| See <a href="https://numpy.org/doc/stable/reference/generated/numpy.hanning.html" rel="nofollow">https://numpy.org/doc/stable/reference/generated/numpy.hanning.html</a> for more information.</p> <p data-svelte-h="svelte-187ugov"><strong>Kind</strong>: static method of <a href="#module_utils/audio"><code>utils/audio</code></a><br> <strong>Returns</strong>: <code>Float64Array</code> - The generated Hanning window.</p> <table data-svelte-h="svelte-1x4zay3"><thead><tr><th>Param</th><th>Type</th><th>Description</th></tr></thead> <tbody><tr><td>M</td><td><code>number</code></td><td><p>The length of the Hanning window to generate.</p></td></tr></tbody></table> <hr> <a id="module_utils/audio.hamming" class="group"></a> <h2 class="relative group"><a id="utilsaudiohammingm--code-float64array-code" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#utilsaudiohammingm--code-float64array-code"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>utils/audio.hamming(M) ⇒ <code> Float64Array </code></span></h2> <p data-svelte-h="svelte-8gymva">Generates a Hamming window of length M. | |
| See <a href="https://numpy.org/doc/stable/reference/generated/numpy.hamming.html" rel="nofollow">https://numpy.org/doc/stable/reference/generated/numpy.hamming.html</a> for more information.</p> <p data-svelte-h="svelte-1dyre01"><strong>Kind</strong>: static method of <a href="#module_utils/audio"><code>utils/audio</code></a><br> <strong>Returns</strong>: <code>Float64Array</code> - The generated Hamming window.</p> <table data-svelte-h="svelte-io80jx"><thead><tr><th>Param</th><th>Type</th><th>Description</th></tr></thead> <tbody><tr><td>M</td><td><code>number</code></td><td><p>The length of the Hamming window to generate.</p></td></tr></tbody></table> <hr> <a id="module_utils/audio.mel_filter_bank" class="group"></a> <h2 class="relative group"><a id="utilsaudiomelfilterbanknumfrequencybins-nummelfilters-minfrequency-maxfrequency-samplingrate-norm-melscale-triangularizeinmelspace--code-array--array--number---code" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#utilsaudiomelfilterbanknumfrequencybins-nummelfilters-minfrequency-maxfrequency-samplingrate-norm-melscale-triangularizeinmelspace--code-array--array--number---code"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" 
fill="currentColor"></path></svg></span></a> <span>utils/audio.mel_filter_bank(num_frequency_bins, num_mel_filters, min_frequency, max_frequency, sampling_rate, [norm], [mel_scale], [triangularize_in_mel_space]) ⇒ <code> Array. < Array < number > > </code></span></h2> <p data-svelte-h="svelte-ke4v6b">Creates a frequency bin conversion matrix used to obtain a mel spectrogram. This is called a <em>mel filter bank</em>, and | |
| various implementations exist, which differ in the number of filters, the shape of the filters, the way the filters | |
| are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these | |
| features is to approximate the non-linear human perception of the variation in pitch with respect to the frequency.</p> <p data-svelte-h="svelte-rtofow"><strong>Kind</strong>: static method of <a href="#module_utils/audio"><code>utils/audio</code></a><br> <strong>Returns</strong>: <code>Array.&lt;Array&lt;number&gt;&gt;</code> - Triangular filter bank matrix, which is a 2D array of shape (<code>num_frequency_bins</code>, <code>num_mel_filters</code>). | |
| This is a projection matrix to go from a spectrogram to a mel spectrogram.</p> <table data-svelte-h="svelte-t1r9o1"><thead><tr><th>Param</th><th>Type</th><th>Description</th></tr></thead> <tbody><tr><td>num_frequency_bins</td><td><code>number</code></td><td><p>Number of frequencies used to compute the spectrogram (should be the same as in <code>stft</code>).</p></td> </tr><tr><td>num_mel_filters</td><td><code>number</code></td><td><p>Number of mel filters to generate.</p></td> </tr><tr><td>min_frequency</td><td><code>number</code></td><td><p>Lowest frequency of interest in Hz.</p></td> </tr><tr><td>max_frequency</td><td><code>number</code></td><td><p>Highest frequency of interest in Hz. This should not exceed <code>sampling_rate / 2</code>.</p></td> </tr><tr><td>sampling_rate</td><td><code>number</code></td><td><p>Sample rate of the audio waveform.</p></td> </tr><tr><td>[norm]</td><td><code>string</code></td><td><p>If <code>"slaney"</code>, divide the triangular mel weights by the width of the mel band (area normalization).</p></td> </tr><tr><td>[mel_scale]</td><td><code>string</code></td><td><p>The mel frequency scale to use, <code>"htk"</code> or <code>"slaney"</code>.</p></td> </tr><tr><td>[triangularize_in_mel_space]</td><td><code>boolean</code></td><td><p>If this option is enabled, the triangular filter is applied in mel space rather than frequency space. | |
| This should be set to <code>true</code> in order to get the same results as <code>torchaudio</code> when computing mel filters.</p></td></tr></tbody></table> <hr> <a id="module_utils/audio.spectrogram" class="group"></a> <h2 class="relative group"><a id="utilsaudiospectrogramwaveform-window-framelength-hoplength-options--code-promise--tensor--code" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#utilsaudiospectrogramwaveform-window-framelength-hoplength-options--code-promise--tensor--code"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>utils/audio.spectrogram(waveform, window, frame_length, hop_length, options) ⇒ <code> Promise. 
< Tensor > </code></span></h2> <p data-svelte-h="svelte-10mwufd">Calculates a spectrogram over one waveform using the Short-Time Fourier Transform.</p> <p data-svelte-h="svelte-zfp2ya">This function can create the following kinds of spectrograms:</p> <ul data-svelte-h="svelte-14mziv8"><li>amplitude spectrogram (<code>power = 1.0</code>)</li> <li>power spectrogram (<code>power = 2.0</code>)</li> <li>complex-valued spectrogram (<code>power = null</code>)</li> <li>log spectrogram (use <code>log_mel</code> argument)</li> <li>mel spectrogram (provide <code>mel_filters</code>)</li> <li>log-mel spectrogram (provide <code>mel_filters</code> and <code>log_mel</code>)</li></ul> <p data-svelte-h="svelte-iwx6w9">In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. | |
| A padded window can be obtained from <code>window_function()</code>. The FFT input buffer may be larger than the analysis frame, | |
| typically the next power of two.</p> <p data-svelte-h="svelte-1shoq4k"><strong>Kind</strong>: static method of <a href="#module_utils/audio"><code>utils/audio</code></a><br> <strong>Returns</strong>: <a href="#Tensor"><code>Promise.&lt;Tensor&gt;</code></a> - Spectrogram of shape <code>(num_frequency_bins, length)</code> (regular spectrogram) or shape <code>(num_mel_filters, length)</code> (mel spectrogram).</p> <table data-svelte-h="svelte-roqmrn"><thead><tr><th>Param</th><th>Type</th><th>Default</th><th>Description</th></tr></thead> <tbody><tr><td>waveform</td><td><code>Float32Array</code> | <code>Float64Array</code></td><td></td><td><p>The input waveform of shape <code>(length,)</code>. This must be a single real-valued, mono waveform.</p></td> </tr><tr><td>window</td><td><code>Float32Array</code> | <code>Float64Array</code></td><td></td><td><p>The windowing function to apply of shape <code>(frame_length,)</code>, including zero-padding if necessary. The actual window length may be | |
| shorter than <code>frame_length</code>, but we're assuming the array has already been zero-padded.</p></td> </tr><tr><td>frame_length</td><td><code>number</code></td><td></td><td><p>The length of the analysis frames in samples (a.k.a., <code>fft_length</code>).</p></td> </tr><tr><td>hop_length</td><td><code>number</code></td><td></td><td><p>The stride between successive analysis frames in samples.</p></td> </tr><tr><td>options</td><td><code>Object</code></td><td></td><td></td> </tr><tr><td>[options.fft_length]</td><td><code>number</code></td><td><code></code></td><td><p>The size of the FFT buffer in samples. This determines how many frequency bins the spectrogram will have. | |
| For optimal speed, this should be a power of two. If <code>null</code>, uses <code>frame_length</code>.</p></td> </tr><tr><td>[options.power]</td><td><code>number</code></td><td><code>1.0</code></td><td><p>If 1.0, returns the amplitude spectrogram. If 2.0, returns the power spectrogram. If <code>null</code>, returns complex numbers.</p></td> </tr><tr><td>[options.center]</td><td><code>boolean</code></td><td><code>true</code></td><td><p>Whether to pad the waveform so that frame <code>t</code> is centered around time <code>t * hop_length</code>. If <code>false</code>, frame | |
| <code>t</code> will start at time <code>t * hop_length</code>.</p></td> </tr><tr><td>[options.pad_mode]</td><td><code>string</code></td><td><code>"reflect"</code></td><td><p>Padding mode used when <code>center</code> is <code>true</code>. Possible values are: <code>"constant"</code> (pad with zeros), | |
| <code>"edge"</code> (pad with edge values), <code>"reflect"</code> (pads with mirrored values).</p></td> </tr><tr><td>[options.onesided]</td><td><code>boolean</code></td><td><code>true</code></td><td><p>If <code>true</code>, only computes the positive frequencies and returns a spectrogram containing <code>fft_length // 2 + 1</code> | |
| frequency bins. If <code>false</code>, also computes the negative frequencies and returns <code>fft_length</code> frequency bins.</p></td> </tr><tr><td>[options.preemphasis]</td><td><code>number</code></td><td><code></code></td><td><p>Coefficient for a high-pass filter that applies pre-emphasis before the DFT.</p></td> </tr><tr><td>[options.mel_filters]</td><td><code>Array.&lt;Array&lt;number&gt;&gt;</code></td><td><code></code></td><td><p>The mel filter bank of shape <code>(num_freq_bins, num_mel_filters)</code>. | |
| If supplied, applies this filter bank to create a mel spectrogram.</p></td> </tr><tr><td>[options.mel_floor]</td><td><code>number</code></td><td><code>1e-10</code></td><td><p>Minimum value of mel frequency banks.</p></td> </tr><tr><td>[options.log_mel]</td><td><code>string</code></td><td><code>null</code></td><td><p>How to convert the spectrogram to log scale. Possible options are: | |
| <code>null</code> (don't convert), <code>"log"</code> (take the natural logarithm), <code>"log10"</code> (take the base-10 logarithm), <code>"dB"</code> (convert to decibels). | |
| Can only be used when <code>power</code> is not <code>null</code>.</p></td> </tr><tr><td>[options.reference]</td><td><code>number</code></td><td><code>1.0</code></td><td><p>Sets the input spectrogram value that corresponds to 0 dB. For example, use <code>max(spectrogram)[0]</code> to set | |
| the loudest part to 0 dB. Must be greater than zero.</p></td> </tr><tr><td>[options.min_value]</td><td><code>number</code></td><td><code>1e-10</code></td><td><p>The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking <code>log(0)</code>. | |
| For a power spectrogram, the default of <code>1e-10</code> corresponds to a minimum of -100 dB. For an amplitude spectrogram, the value <code>1e-5</code> corresponds to -100 dB. | |
| Must be greater than zero.</p></td> </tr><tr><td>[options.db_range]</td><td><code>number</code></td><td><code></code></td><td><p>Sets the maximum dynamic range in decibels. For example, if <code>db_range = 80</code>, the difference between the | |
| peak value and the smallest value will never be more than 80 dB. Must be greater than zero.</p></td> </tr><tr><td>[options.remove_dc_offset]</td><td><code>boolean</code></td><td><code></code></td><td><p>Subtract mean from waveform on each frame, applied before pre-emphasis. This should be set to <code>true</code> in | |
| order to get the same results as <code>torchaudio.compliance.kaldi.fbank</code> when computing mel filters.</p></td> </tr><tr><td>[options.max_num_frames]</td><td><code>number</code></td><td><code></code></td><td><p>If provided, limits the number of frames to compute to this value.</p></td> </tr><tr><td>[options.min_num_frames]</td><td><code>number</code></td><td><code></code></td><td><p>If provided, ensures the number of frames to compute is at least this value.</p></td> </tr><tr><td>[options.do_pad]</td><td><code>boolean</code></td><td><code>true</code></td><td><p>If <code>true</code>, pads the output spectrogram to have <code>max_num_frames</code> frames.</p></td> </tr><tr><td>[options.transpose]</td><td><code>boolean</code></td><td><code>false</code></td><td><p>If <code>true</code>, the returned spectrogram will have shape <code>(num_frames, num_frequency_bins/num_mel_filters)</code>. If <code>false</code>, the returned spectrogram will have shape <code>(num_frequency_bins/num_mel_filters, num_frames)</code>.</p></td></tr></tbody></table> <hr> <a id="module_utils/audio.window_function" class="group"></a> <h2 class="relative group"><a id="utilsaudiowindowfunctionwindowlength-name-options--code-float64array-code" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#utilsaudiowindowfunctionwindowlength-name-options--code-float64array-code"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 
0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>utils/audio.window_function(window_length, name, options) ⇒ <code> Float64Array </code></span></h2> <p data-svelte-h="svelte-13l1rbm">Returns an array containing the specified window.</p> <p data-svelte-h="svelte-9ixqdv"><strong>Kind</strong>: static method of <a href="#module_utils/audio"><code>utils/audio</code></a><br> <strong>Returns</strong>: <code>Float64Array</code> - The window of shape <code>(window_length,)</code> or <code>(frame_length,)</code>.</p> <table data-svelte-h="svelte-h693em"><thead><tr><th>Param</th><th>Type</th><th>Default</th><th>Description</th></tr></thead> <tbody><tr><td>window_length</td><td><code>number</code></td><td></td><td><p>The length of the window in samples.</p></td> </tr><tr><td>name</td><td><code>string</code></td><td></td><td><p>The name of the window function.</p></td> </tr><tr><td>options</td><td><code>Object</code></td><td></td><td><p>Additional options.</p></td> </tr><tr><td>[options.periodic]</td><td><code>boolean</code></td><td><code>true</code></td><td><p>Whether the window is periodic or symmetric.</p></td> </tr><tr><td>[options.frame_length]</td><td><code>number</code></td><td><code></code></td><td><p>The length of the analysis frames in samples. | |
| Provide a value for <code>frame_length</code> if the window is smaller than the frame length, so that it will be zero-padded.</p></td> </tr><tr><td>[options.center]</td><td><code>boolean</code></td><td><code>true</code></td><td><p>Whether to center the window inside the FFT buffer. Only used when <code>frame_length</code> is provided.</p></td></tr></tbody></table> <hr> <a id="module_utils/audio..generalized_cosine_window" class="group"></a> <h2 class="relative group"><a id="utilsaudiogeneralizedcosinewindowm-a0--code-float64array-code" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#utilsaudiogeneralizedcosinewindowm-a0--code-float64array-code"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>utils/audio~generalized_cosine_window(M, a_0) ⇒ <code> Float64Array </code></span></h2> <p data-svelte-h="svelte-1lyeqxj">Helper function to generate windows that are special cases of the generalized cosine window. | |
| See <a href="https://www.mathworks.com/help/signal/ug/generalized-cosine-windows.html" rel="nofollow">https://www.mathworks.com/help/signal/ug/generalized-cosine-windows.html</a> for more information.</p> <p data-svelte-h="svelte-xu9700"><strong>Kind</strong>: inner method of <a href="#module_utils/audio"><code>utils/audio</code></a><br> <strong>Returns</strong>: <code>Float64Array</code> - The generated window.</p> <table data-svelte-h="svelte-11w0boh"><thead><tr><th>Param</th><th>Type</th><th>Description</th></tr></thead> <tbody><tr><td>M</td><td><code>number</code></td><td><p>Number of points in the output window. If zero or less, an empty array is returned.</p></td> </tr><tr><td>a_0</td><td><code>number</code></td><td><p>Offset for the generalized cosine window.</p></td></tr></tbody></table> <hr> <a id="module_utils/audio..hertz_to_mel" class="group"></a> <h2 class="relative group"><a id="utilsaudiohertztomelfreq-melscale--code-t-code" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#utilsaudiohertztomelfreq-melscale--code-t-code"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>utils/audio~hertz_to_mel(freq, 
[mel_scale]) ⇒ <code> T </code></span></h2> <p data-svelte-h="svelte-5cebk5"><strong>Kind</strong>: inner method of <a href="#module_utils/audio"><code>utils/audio</code></a></p> <table data-svelte-h="svelte-1w6ehfm"><thead><tr><th>Param</th><th>Type</th><th>Default</th></tr></thead> <tbody><tr><td>freq</td><td><code>T</code></td><td></td> </tr><tr><td>[mel_scale]</td><td><code>string</code></td><td><code>"htk"</code></td></tr></tbody></table> <hr> <a id="module_utils/audio..mel_to_hertz" class="group"></a> <h2 class="relative group"><a id="utilsaudiomeltohertzmels-melscale--code-t-code" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#utilsaudiomeltohertzmels-melscale--code-t-code"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>utils/audio~mel_to_hertz(mels, [mel_scale]) ⇒ <code> T </code></span></h2> <p data-svelte-h="svelte-5cebk5"><strong>Kind</strong>: inner method of <a href="#module_utils/audio"><code>utils/audio</code></a></p> <table data-svelte-h="svelte-io7ocd"><thead><tr><th>Param</th><th>Type</th><th>Default</th></tr></thead> <tbody><tr><td>mels</td><td><code>T</code></td><td></td> 
</tr><tr><td>[mel_scale]</td><td><code>string</code></td><td><code>"htk"</code></td></tr></tbody></table> <hr> <a id="module_utils/audio.._create_triangular_filter_bank" class="group"></a> <h2 class="relative group"><a id="utilsaudiocreatetriangularfilterbankfftfreqs-filterfreqs--code-array--array--number---code" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#utilsaudiocreatetriangularfilterbankfftfreqs-filterfreqs--code-array--array--number---code"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>utils/audio~_create_triangular_filter_bank(fft_freqs, filter_freqs) ⇒ <code> Array. 
< Array < number > > </code></span></h2> <p data-svelte-h="svelte-uj0il7">Creates a triangular filter bank.</p> <p data-svelte-h="svelte-srmfwi">Adapted from torchaudio and librosa.</p> <p data-svelte-h="svelte-aln42z"><strong>Kind</strong>: inner method of <a href="#module_utils/audio"><code>utils/audio</code></a><br> <strong>Returns</strong>: <code>Array.&lt;Array&lt;number&gt;&gt;</code> - of shape <code>(num_frequency_bins, num_mel_filters)</code>.</p> <table data-svelte-h="svelte-2x9dyx"><thead><tr><th>Param</th><th>Type</th><th>Description</th></tr></thead> <tbody><tr><td>fft_freqs</td><td><code>Float64Array</code></td><td><p>Discrete frequencies of the FFT bins in Hz, of shape <code>(num_frequency_bins,)</code>.</p></td> </tr><tr><td>filter_freqs</td><td><code>Float64Array</code></td><td><p>Center frequencies of the triangular filters to create, in Hz, of shape <code>(num_mel_filters,)</code>.</p></td></tr></tbody></table> <hr> <a id="module_utils/audio..linspace" class="group"></a> <h2 class="relative group"><a id="utilsaudiolinspacestart-end-num-" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#utilsaudiolinspacestart-end-num-"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" 
fill="currentColor"></path></svg></span></a> <span>utils/audio~linspace(start, end, num) ⇒</span></h2> <p data-svelte-h="svelte-osjtvn">Return evenly spaced numbers over a specified interval.</p> <p data-svelte-h="svelte-bfmjcl"><strong>Kind</strong>: inner method of <a href="#module_utils/audio"><code>utils/audio</code></a><br> <strong>Returns</strong>: <code>num</code> evenly spaced samples, calculated over the interval <code>[start, end]</code>.</p> <table data-svelte-h="svelte-c7ztlp"><thead><tr><th>Param</th><th>Type</th><th>Description</th></tr></thead> <tbody><tr><td>start</td><td><code>number</code></td><td><p>The starting value of the sequence.</p></td> </tr><tr><td>end</td><td><code>number</code></td><td><p>The end value of the sequence.</p></td> </tr><tr><td>num</td><td><code>number</code></td><td><p>Number of samples to generate.</p></td></tr></tbody></table> <hr> <a id="module_utils/audio..padReflect" class="group"></a> <h2 class="relative group"><a id="utilsaudiopadreflectarray-left-right--code-t-code" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#utilsaudiopadreflectarray-left-right--code-t-code"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" 
fill="currentColor"></path></svg></span></a> <span>utils/audio~padReflect(array, left, right) ⇒ <code> T </code></span></h2> <p data-svelte-h="svelte-115nnpl"><strong>Kind</strong>: inner method of <a href="#module_utils/audio"><code>utils/audio</code></a><br> <strong>Returns</strong>: <code>T</code> - The padded array.</p> <table data-svelte-h="svelte-6p81mk"><thead><tr><th>Param</th><th>Type</th><th>Description</th></tr></thead> <tbody><tr><td>array</td><td><code>T</code></td><td><p>The array to pad.</p></td> </tr><tr><td>left</td><td><code>number</code></td><td><p>The amount of padding to add to the left.</p></td> </tr><tr><td>right</td><td><code>number</code></td><td><p>The amount of padding to add to the right.</p></td></tr></tbody></table> <hr> <a id="module_utils/audio.._db_conversion_helper" class="group"></a> <h2 class="relative group"><a id="utilsaudiodbconversionhelperspectrogram-factor-reference-minvalue-dbrange--code-t-code" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#utilsaudiodbconversionhelperspectrogram-factor-reference-minvalue-dbrange--code-t-code"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> 
<span>utils/audio~_db_conversion_helper(spectrogram, factor, reference, min_value, db_range) ⇒ <code> T </code></span></h2> <p data-svelte-h="svelte-9t3wul">Helper function to compute <code>amplitude_to_db</code> and <code>power_to_db</code>.</p> <p data-svelte-h="svelte-5cebk5"><strong>Kind</strong>: inner method of <a href="#module_utils/audio"><code>utils/audio</code></a></p> <table data-svelte-h="svelte-heeupf"><thead><tr><th>Param</th><th>Type</th></tr></thead> <tbody><tr><td>spectrogram</td><td><code>T</code></td> </tr><tr><td>factor</td><td><code>number</code></td> </tr><tr><td>reference</td><td><code>number</code></td> </tr><tr><td>min_value</td><td><code>number</code></td> </tr><tr><td>db_range</td><td><code>number</code></td></tr></tbody></table> <hr> <a id="module_utils/audio..amplitude_to_db" class="group"></a> <h2 class="relative group"><a id="utilsaudioamplitudetodbspectrogram-reference-minvalue-dbrange--code-t-code" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#utilsaudioamplitudetodbspectrogram-reference-minvalue-dbrange--code-t-code"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> 
<span>utils/audio~amplitude_to_db(spectrogram, [reference], [min_value], [db_range]) ⇒ <code> T </code></span></h2> <p data-svelte-h="svelte-93lmwr">Converts an amplitude spectrogram to the decibel scale. This computes <code>20 * log10(spectrogram / reference)</code>, | |
| using basic logarithm properties for numerical stability. NOTE: Operates in-place.</p> <p data-svelte-h="svelte-12zo44d">The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a | |
| linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it. | |
| This means that large variations in energy may not sound all that different if the sound is loud to begin with. | |
| This compression operation makes the (mel) spectrogram features match more closely what humans actually hear.</p> <p data-svelte-h="svelte-1d363v6"><strong>Kind</strong>: inner method of <a href="#module_utils/audio"><code>utils/audio</code></a><br> <strong>Returns</strong>: <code>T</code> - The modified spectrogram in decibels.</p> <table data-svelte-h="svelte-1wpe278"><thead><tr><th>Param</th><th>Type</th><th>Default</th><th>Description</th></tr></thead> <tbody><tr><td>spectrogram</td><td><code>T</code></td><td></td><td><p>The input amplitude (mel) spectrogram.</p></td> </tr><tr><td>[reference]</td><td><code>number</code></td><td><code>1.0</code></td><td><p>Sets the input spectrogram value that corresponds to 0 dB. | |
| For example, use <code>np.max(spectrogram)</code> to set the loudest part to 0 dB. Must be greater than zero.</p></td> </tr><tr><td>[min_value]</td><td><code>number</code></td><td><code>1e-5</code></td><td><p>The spectrogram will be clipped to this minimum value before conversion to decibels, | |
| to avoid taking <code>log(0)</code>. The default of <code>1e-5</code> corresponds to a minimum of -100 dB. Must be greater than zero.</p></td> </tr><tr><td>[db_range]</td><td><code>number</code></td><td><code></code></td><td><p>Sets the maximum dynamic range in decibels. For example, if <code>db_range = 80</code>, the | |
| difference between the peak value and the smallest value will never be more than 80 dB. Must be greater than zero.</p></td></tr></tbody></table> <hr> <a id="module_utils/audio..power_to_db" class="group"></a> <h2 class="relative group"><a id="utilsaudiopowertodbspectrogram-reference-minvalue-dbrange--code-t-code" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#utilsaudiopowertodbspectrogram-reference-minvalue-dbrange--code-t-code"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>utils/audio~power_to_db(spectrogram, [reference], [min_value], [db_range]) ⇒ <code> T </code></span></h2> <p data-svelte-h="svelte-1dxatci">Converts a power spectrogram to the decibel scale. This computes <code>10 * log10(spectrogram / reference)</code>, | |
| using basic logarithm properties for numerical stability. NOTE: Operates in-place.</p> <p data-svelte-h="svelte-12zo44d">The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a | |
| linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it. | |
| This means that large variations in energy may not sound all that different if the sound is loud to begin with. | |
| This compression operation makes the (mel) spectrogram features match more closely what humans actually hear.</p> <p data-svelte-h="svelte-1bxqqp">Based on the implementation of <code>librosa.power_to_db</code>.</p> <p data-svelte-h="svelte-1d363v6"><strong>Kind</strong>: inner method of <a href="#module_utils/audio"><code>utils/audio</code></a><br> <strong>Returns</strong>: <code>T</code> - The modified spectrogram in decibels.</p> <table data-svelte-h="svelte-h9b3kb"><thead><tr><th>Param</th><th>Type</th><th>Default</th><th>Description</th></tr></thead> <tbody><tr><td>spectrogram</td><td><code>T</code></td><td></td><td><p>The input power (mel) spectrogram. Note that a power spectrogram has the amplitudes squared!</p></td> </tr><tr><td>[reference]</td><td><code>number</code></td><td><code>1.0</code></td><td><p>Sets the input spectrogram value that corresponds to 0 dB. | |
| For example, use <code>np.max(spectrogram)</code> to set the loudest part to 0 dB. Must be greater than zero.</p></td> </tr><tr><td>[min_value]</td><td><code>number</code></td><td><code>1e-10</code></td><td><p>The spectrogram will be clipped to this minimum value before conversion to decibels, | |
| to avoid taking <code>log(0)</code>. The default of <code>1e-10</code> corresponds to a minimum of -100 dB. Must be greater than zero.</p></td> </tr><tr><td>[db_range]</td><td><code>number</code></td><td><code></code></td><td><p>Sets the maximum dynamic range in decibels. For example, if <code>db_range = 80</code>, the | |
| difference between the peak value and the smallest value will never be more than 80 dB. Must be greater than zero.</p></td></tr></tbody></table> <hr> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers.js/blob/main/docs/source/api/utils/audio.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
| __sveltekit_kuyevp = { | |
| assets: "/docs/transformers.js/pr_1113/en", | |
| base: "/docs/transformers.js/pr_1113/en", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/transformers.js/pr_1113/en/_app/immutable/entry/start.88a6e140.js"), | |
| import("/docs/transformers.js/pr_1113/en/_app/immutable/entry/app.0003020d.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 16], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 58.2 kB
- Xet hash:
- b953d08f1abd10b1e936e1aee4370d1a8e47b017b7a339b216242a4adac4d946
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.