Spaces:

rnnandi
/

convert-gemma3-to-onnx

Running

App Files Files Community

convert-gemma3-to-onnx / transformers.js /src /utils /audio.js

rnnandi's picture

Add all files to convert gemma3 model to onnx

ca97aa9 2 months ago

history blame contribute delete

34.7 kB

	/**
	* @file Helper module for audio processing.
	*
	* These functions and classes are only used internally,
	* meaning an end-user shouldn't need to access anything here.
	*
	* @module utils/audio
	*/

	import {
	getFile,
	} from './hub.js';
	import { FFT, max } from './maths.js';
	import {
	calculateReflectOffset, saveBlob,
	} from './core.js';
	import { apis } from '../env.js';
	import { Tensor, matmul } from './tensor.js';
	import fs from 'node:fs';

	/**
	 * Helper function to read audio from a path/URL.
	 *
	 * The file is fetched with `getFile` and decoded with an `AudioContext`, so this only works in
	 * environments where `AudioContext` is defined. Two-channel audio is downmixed to mono; for any
	 * other channel count only the first channel is returned.
	 *
	 * @param {string|URL} url The path/URL to load the audio from.
	 * @param {number} sampling_rate The sampling rate to use when decoding the audio.
	 * @returns {Promise<Float32Array>} The decoded audio as a `Float32Array`.
	 * @throws {Error} If `AudioContext` is not available in the current environment.
	 */
	export async function read_audio(url, sampling_rate) {
	    if (typeof AudioContext === 'undefined') {
	        // Running in node or an environment without AudioContext
	        throw Error(
	            "Unable to load audio from path/URL since `AudioContext` is not available in your environment. " +
	            "Instead, audio data should be passed directly to the pipeline/processor. " +
	            "For more information and some example code, see https://huggingface.co/docs/transformers.js/guides/node-audio-processing."
	        );
	    }

	    const response = await (await getFile(url)).arrayBuffer();
	    const audioCTX = new AudioContext({ sampleRate: sampling_rate });
	    try {
	        if (typeof sampling_rate === 'undefined') {
	            console.warn(`No sampling rate provided, using default of ${audioCTX.sampleRate}Hz.`);
	        }
	        const decoded = await audioCTX.decodeAudioData(response);

	        // Anything that is not stereo: just use the first channel.
	        if (decoded.numberOfChannels !== 2) {
	            return decoded.getChannelData(0);
	        }

	        // We now replicate HuggingFace's `ffmpeg_read` method.
	        // When FFmpeg downmixes stereo to mono (`-ac 1`), it scales each channel by
	        // 1/sqrt(2) (~0.707) before summing, so that two full-scale channels do not clip.
	        // `SCALING_FACTOR * (left + right) / 2` is the same value as `(left + right) / sqrt(2)`.
	        const SCALING_FACTOR = Math.sqrt(2);

	        const left = decoded.getChannelData(0);
	        const right = decoded.getChannelData(1);

	        const audio = new Float32Array(left.length);
	        for (let i = 0; i < decoded.length; ++i) {
	            audio[i] = SCALING_FACTOR * (left[i] + right[i]) / 2;
	        }
	        return audio;
	    } finally {
	        // The context is only needed for decoding; release it so that repeated calls
	        // do not accumulate open audio contexts.
	        if (audioCTX.state !== 'closed') {
	            await audioCTX.close();
	        }
	    }
	}

	/**
	 * Builds a window from the generalized cosine family, `a_0 - (1 - a_0) * cos(2 * pi * n / (M - 1))`.
	 * See https://www.mathworks.com/help/signal/ug/generalized-cosine-windows.html for more information.
	 * @param {number} M Number of points in the output window. If zero or less, an empty array is returned.
	 * @param {number} a_0 Offset for the generalized cosine window.
	 * @returns {Float64Array} The generated window.
	 */
	function generalized_cosine_window(M, a_0) {
	    // Degenerate sizes: nothing to generate, or a single unit sample.
	    if (M < 1) {
	        return new Float64Array();
	    }
	    if (M === 1) {
	        return new Float64Array([1]);
	    }

	    const cosine_weight = 1 - a_0;
	    const angular_step = 2 * Math.PI / (M - 1);

	    return Float64Array.from(
	        { length: M },
	        (_, n) => a_0 - cosine_weight * Math.cos(n * angular_step),
	    );
	}

	/**
	 * Generates a Hanning window of length M, i.e. a generalized cosine window with offset 0.5.
	 * See https://numpy.org/doc/stable/reference/generated/numpy.hanning.html for more information.
	 *
	 * @param {number} M The length of the Hanning window to generate.
	 * @returns {Float64Array} The generated Hanning window.
	 */
	export function hanning(M) {
	    const HANN_OFFSET = 0.5;
	    return generalized_cosine_window(M, HANN_OFFSET);
	}


	/**
	 * Generates a Hamming window of length M, i.e. a generalized cosine window with offset 0.54.
	 * See https://numpy.org/doc/stable/reference/generated/numpy.hamming.html for more information.
	 *
	 * @param {number} M The length of the Hamming window to generate.
	 * @returns {Float64Array} The generated Hamming window.
	 */
	export function hamming(M) {
	    const HAMMING_OFFSET = 0.54;
	    return generalized_cosine_window(M, HAMMING_OFFSET);
	}


	/**
	 * Hertz-to-mel conversion functions, keyed by mel scale name (`"htk"`, `"kaldi"`, `"slaney"`).
	 * Each function maps a single frequency in Hz to the corresponding mel value.
	 * The extra defaulted parameters of the `"slaney"` entry are its constants; callers should
	 * pass only the frequency.
	 */
	const HERTZ_TO_MEL_MAPPING = {
	    "htk": (/** @type {number} */ freq) => 2595.0 * Math.log10(1.0 + (freq / 700.0)),
	    "kaldi": (/** @type {number} */ freq) => 1127.0 * Math.log(1.0 + (freq / 700.0)),
	    "slaney": (/** @type {number} */ freq, min_log_hertz = 1000.0, min_log_mel = 15.0, logstep = 27.0 / Math.log(6.4)) =>
	        // Linear below `min_log_hertz`, logarithmic at or above it.
	        freq >= min_log_hertz
	            ? min_log_mel + Math.log(freq / min_log_hertz) * logstep
	            : 3.0 * freq / 200.0,
	};

	/**
	 * Converts a frequency, or a typed array of frequencies, from Hz to mels using the
	 * conversion registered in `HERTZ_TO_MEL_MAPPING` for the requested scale.
	 * @template {Float32Array|Float64Array|number} T
	 * @param {T} freq The frequency (or frequencies) in Hz.
	 * @param {string} [mel_scale] The mel scale to use: `"htk"`, `"slaney"` or `"kaldi"`.
	 * @returns {T} The converted value(s), in the same container type as the input.
	 * @throws {Error} If no conversion is registered for `mel_scale`.
	 */
	function hertz_to_mel(freq, mel_scale = "htk") {
	    const convert = HERTZ_TO_MEL_MAPPING[mel_scale];
	    if (!convert) {
	        throw new Error('mel_scale should be one of "htk", "slaney" or "kaldi".');
	    }

	    if (typeof freq === 'number') {
	        // @ts-ignore
	        return convert(freq);
	    }

	    // Wrap the call so that `map`'s index/array arguments are not forwarded to the converter.
	    // @ts-ignore
	    return freq.map((value) => convert(value));
	}

	/**
	 * Mel-to-hertz conversion functions, keyed by mel scale name (`"htk"`, `"kaldi"`, `"slaney"`).
	 * Each function maps a single mel value to the corresponding frequency in Hz.
	 * The extra defaulted parameters of the `"slaney"` entry are its constants; callers should
	 * pass only the mel value.
	 */
	const MEL_TO_HERTZ_MAPPING = {
	    "htk": (/** @type {number} */ mels) => 700.0 * (10.0 ** (mels / 2595.0) - 1.0),
	    "kaldi": (/** @type {number} */ mels) => 700.0 * (Math.exp(mels / 1127.0) - 1.0),
	    "slaney": (/** @type {number} */ mels, min_log_hertz = 1000.0, min_log_mel = 15.0, logstep = Math.log(6.4) / 27.0) =>
	        // Linear below `min_log_mel`, exponential at or above it.
	        mels >= min_log_mel
	            ? min_log_hertz * Math.exp(logstep * (mels - min_log_mel))
	            : 200.0 * mels / 3.0,
	};

	/**
	 * Converts a mel value, or a typed array of mel values, to Hz using the conversion
	 * registered in `MEL_TO_HERTZ_MAPPING` for the requested scale.
	 * @template {Float32Array|Float64Array|number} T
	 * @param {T} mels The mel value(s) to convert.
	 * @param {string} [mel_scale] The mel scale to use: `"htk"`, `"slaney"` or `"kaldi"`.
	 * @returns {T} The converted value(s), in the same container type as the input.
	 * @throws {Error} If no conversion is registered for `mel_scale`.
	 */
	function mel_to_hertz(mels, mel_scale = "htk") {
	    const convert = MEL_TO_HERTZ_MAPPING[mel_scale];
	    if (!convert) {
	        throw new Error('mel_scale should be one of "htk", "slaney" or "kaldi".');
	    }

	    if (typeof mels === 'number') {
	        // @ts-ignore
	        return convert(mels);
	    }

	    // Wrap the call so that `map`'s index/array arguments are not forwarded to the converter.
	    // @ts-ignore
	    return mels.map((value) => convert(value));
	}

	/**
	 * Creates a triangular filter bank.
	 *
	 * Adapted from torchaudio and librosa.
	 *
	 * Filter `i` rises from `filter_freqs[i]` to a peak at `filter_freqs[i + 1]` and falls back to zero
	 * at `filter_freqs[i + 2]`, so `filter_freqs.length - 2` filters are produced.
	 *
	 * @param {Float64Array} fft_freqs Discrete frequencies of the FFT bins in Hz, of shape `(num_frequency_bins,)`.
	 * @param {Float64Array} filter_freqs Edge/center frequencies of the triangular filters to create, in Hz.
	 * @returns {number[][]} One row per filter, each row holding one weight per entry of `fft_freqs`,
	 * i.e. shape `(filter_freqs.length - 2, fft_freqs.length)`.
	 */
	function _create_triangular_filter_bank(fft_freqs, filter_freqs) {
	    const num_bins = fft_freqs.length;
	    const num_filters = filter_freqs.length - 2;

	    // Distance between consecutive filter frequencies.
	    const widths = Float64Array.from(
	        { length: filter_freqs.length - 1 },
	        (_, k) => filter_freqs[k + 1] - filter_freqs[k],
	    );

	    return Array.from({ length: num_filters }, (_, i) => {
	        const row = new Array(num_bins);
	        for (let j = 0; j < num_bins; ++j) {
	            // Rising edge (left of the peak) and falling edge (right of the peak).
	            const rising = -(filter_freqs[i] - fft_freqs[j]) / widths[i];
	            const falling = (filter_freqs[i + 2] - fft_freqs[j]) / widths[i + 1];
	            // The triangle is the lower of the two edges, clamped at zero outside its support.
	            row[j] = Math.max(0, Math.min(rising, falling));
	        }
	        return row;
	    });
	}

	/**
	 * Return evenly spaced numbers over a specified interval.
	 * @param {number} start The starting value of the sequence.
	 * @param {number} end The end value of the sequence.
	 * @param {number} num Number of samples to generate.
	 * @returns {Float64Array} `num` evenly spaced samples, calculated over the interval `[start, end]`.
	 * When `num` is 1, the single sample is `start`.
	 */
	function linspace(start, end, num) {
	    if (num === 1) {
	        // The general formula divides by `num - 1`, which would produce NaN here.
	        return Float64Array.of(start);
	    }
	    const step = (end - start) / (num - 1);
	    return Float64Array.from({ length: num }, (_, i) => start + step * i);
	}

	/**
	 * Creates a frequency bin conversion matrix used to obtain a mel spectrogram. This is called a mel filter bank, and
	 * various implementations exist, which differ in the number of filters, the shape of the filters, the way the filters
	 * are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these
	 * features is to approximate the non-linear human perception of the variation in pitch with respect to the frequency.
	 * @param {number} num_frequency_bins Number of frequency bins (should be the same as `n_fft // 2 + 1`
	 * where `n_fft` is the size of the Fourier Transform used to compute the spectrogram).
	 * @param {number} num_mel_filters Number of mel filters to generate.
	 * @param {number} min_frequency Lowest frequency of interest in Hz.
	 * @param {number} max_frequency Highest frequency of interest in Hz. This should not exceed `sampling_rate / 2`.
	 * @param {number} sampling_rate Sample rate of the audio waveform.
	 * @param {string} [norm] If `"slaney"`, divide the triangular mel weights by the width of the mel band (area normalization).
	 * @param {string} [mel_scale] The mel frequency scale to use: `"htk"`, `"kaldi"` or `"slaney"`.
	 * @param {boolean} [triangularize_in_mel_space] If this option is enabled, the triangular filter is applied in mel space rather than frequency space.
	 * This should be set to `true` in order to get the same results as `torchaudio` when computing mel filters.
	 * @returns {number[][]} Triangular filter bank matrix with one row per mel filter and one column per
	 * frequency bin, i.e. shape (`num_mel_filters`, `num_frequency_bins`).
	 * This is a projection matrix to go from a spectrogram to a mel spectrogram.
	 * @throws {Error} If `norm` is neither `null` nor `"slaney"`, if `num_frequency_bins < 2`,
	 * or if `min_frequency > max_frequency`.
	 */
	export function mel_filter_bank(
	    num_frequency_bins,
	    num_mel_filters,
	    min_frequency,
	    max_frequency,
	    sampling_rate,
	    norm = null,
	    mel_scale = "htk",
	    triangularize_in_mel_space = false,
	) {
	    // --- Argument validation ---
	    const use_slaney_norm = norm === "slaney";
	    if (!(norm === null || use_slaney_norm)) {
	        throw new Error('norm must be one of null or "slaney"');
	    }
	    if (num_frequency_bins < 2) {
	        throw new Error(`Require num_frequency_bins: ${num_frequency_bins} >= 2`);
	    }
	    if (min_frequency > max_frequency) {
	        throw new Error(`Require min_frequency: ${min_frequency} <= max_frequency: ${max_frequency}`);
	    }

	    // Filter frequencies are evenly spaced on the mel scale (two extra points for the outer edges).
	    const mel_freqs = linspace(
	        hertz_to_mel(min_frequency, mel_scale),
	        hertz_to_mel(max_frequency, mel_scale),
	        num_mel_filters + 2,
	    );

	    let filter_freqs;
	    let fft_freqs; // frequencies of the FFT bins, in the same unit as `filter_freqs`
	    if (triangularize_in_mel_space) {
	        // Both the FFT bins and the filters are expressed in mels.
	        const fft_bin_width = sampling_rate / ((num_frequency_bins - 1) * 2);
	        const bin_hertz = Float64Array.from({ length: num_frequency_bins }, (_, i) => i * fft_bin_width);
	        fft_freqs = hertz_to_mel(bin_hertz, mel_scale);
	        filter_freqs = mel_freqs;
	    } else {
	        // Both the FFT bins and the filters are expressed in Hz.
	        filter_freqs = mel_to_hertz(mel_freqs, mel_scale);
	        fft_freqs = linspace(0, Math.floor(sampling_rate / 2), num_frequency_bins);
	    }

	    const mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs);

	    if (use_slaney_norm) {
	        // Slaney-style mel is scaled to be approx constant energy per channel
	        for (let m = 0; m < num_mel_filters; ++m) {
	            const weights = mel_filters[m];
	            const band_scale = 2.0 / (filter_freqs[m + 2] - filter_freqs[m]);
	            for (let bin = 0; bin < num_frequency_bins; ++bin) {
	                weights[bin] *= band_scale;
	            }
	        }
	    }

	    // TODO warn if there is a zero row

	    return mel_filters;
	}

	/**
	 * Pads an array with a reflected version of itself on both ends.
	 * The source index for every padded position is obtained from `calculateReflectOffset`.
	 * @template {Float32Array|Float64Array} T
	 * @param {T} array The array to pad.
	 * @param {number} left The amount of padding to add to the left.
	 * @param {number} right The amount of padding to add to the right.
	 * @returns {T} A new array of length `array.length + left + right`, of the same type as `array`.
	 */
	function padReflect(array, left, right) {
	    const size = array.length;
	    const last = size - 1;
	    // @ts-ignore
	    const result = new array.constructor(size + left + right);

	    // Left padding, filled outwards from the start of the original data.
	    for (let k = 1; k <= left; ++k) {
	        result[left - k] = array[calculateReflectOffset(k, last)];
	    }

	    // The original samples, shifted right by `left`.
	    for (let k = 0; k < size; ++k) {
	        result[left + k] = array[k];
	    }

	    // Right padding, filled outwards from the end of the original data.
	    for (let k = 1; k <= right; ++k) {
	        result[last + left + k] = array[calculateReflectOffset(last - k, last)];
	    }

	    return result;
	}

	/**
	 * Helper function to compute `amplitude_to_db` and `power_to_db`.
	 *
	 * Each value `x` is replaced in-place by `factor * (log10(max(min_value, x)) - log10(reference))`,
	 * and, if `db_range` is given, values more than `db_range` below the maximum are raised to
	 * `maximum - db_range`. All arguments are validated before the input is modified.
	 *
	 * @template {Float32Array|Float64Array} T
	 * @param {T} spectrogram The spectrogram to convert. Modified in-place.
	 * @param {number} factor Multiplier applied to the base-10 log ratio.
	 * @param {number} reference The input value that corresponds to 0 dB. Must be greater than zero.
	 * @param {number} min_value Lower clipping bound applied before the logarithm. Must be greater than zero.
	 * @param {number} db_range Maximum dynamic range in decibels, or `null` to disable. Must be greater than zero.
	 * @returns {T} The same `spectrogram` instance, now in decibels.
	 * @throws {Error} If `reference`, `min_value` or a provided `db_range` is not greater than zero.
	 */
	function _db_conversion_helper(spectrogram, factor, reference, min_value, db_range) {
	    if (reference <= 0) {
	        throw new Error('reference must be greater than zero');
	    }

	    if (min_value <= 0) {
	        throw new Error('min_value must be greater than zero');
	    }

	    // Validate up front so that a bad `db_range` does not leave a half-converted buffer behind.
	    const limit_range = db_range !== null && db_range !== undefined;
	    if (limit_range && db_range <= 0) {
	        throw new Error('db_range must be greater than zero');
	    }

	    const logReference = Math.log10(Math.max(min_value, reference));
	    for (let i = 0; i < spectrogram.length; ++i) {
	        // log10(x / reference) computed as log10(x) - log10(reference) for numerical stability.
	        spectrogram[i] = factor * (Math.log10(Math.max(min_value, spectrogram[i])) - logReference);
	    }

	    if (limit_range) {
	        const minimum_db = max(spectrogram)[0] - db_range;
	        for (let i = 0; i < spectrogram.length; ++i) {
	            spectrogram[i] = Math.max(spectrogram[i], minimum_db);
	        }
	    }

	    return spectrogram;
	}

	/**
	 * Converts an amplitude spectrogram to the decibel scale, nominally `20 * log10(spectrogram / reference)`.
	 * The conversion itself is delegated to `_db_conversion_helper` with a factor of 20.
	 * NOTE: Operates in-place.
	 *
	 * The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a
	 * linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it.
	 * This means that large variations in energy may not sound all that different if the sound is loud to begin with.
	 * This compression operation makes the (mel) spectrogram features match more closely what humans actually hear.
	 *
	 * @template {Float32Array|Float64Array} T
	 * @param {T} spectrogram The input amplitude (mel) spectrogram.
	 * @param {number} [reference=1.0] Sets the input spectrogram value that corresponds to 0 dB.
	 * For example, use `np.max(spectrogram)` to set the loudest part to 0 dB. Must be greater than zero.
	 * @param {number} [min_value=1e-5] The spectrogram will be clipped to this minimum value before conversion to decibels,
	 * to avoid taking `log(0)`. The default of `1e-5` corresponds to a minimum of -100 dB. Must be greater than zero.
	 * @param {number} [db_range=null] Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the
	 * difference between the peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
	 * @returns {T} The modified spectrogram in decibels.
	 */
	function amplitude_to_db(spectrogram, reference = 1.0, min_value = 1e-5, db_range = null) {
	    // Amplitude ratios use a factor of 20 (power ratios use 10).
	    const AMPLITUDE_DB_FACTOR = 20.0;
	    return _db_conversion_helper(spectrogram, AMPLITUDE_DB_FACTOR, reference, min_value, db_range);
	}

	/**
	 * Converts a power spectrogram to the decibel scale, nominally `10 * log10(spectrogram / reference)`.
	 * The conversion itself is delegated to `_db_conversion_helper` with a factor of 10.
	 * NOTE: Operates in-place.
	 *
	 * The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a
	 * linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it.
	 * This means that large variations in energy may not sound all that different if the sound is loud to begin with.
	 * This compression operation makes the (mel) spectrogram features match more closely what humans actually hear.
	 *
	 * Based on the implementation of `librosa.power_to_db`.
	 *
	 * @template {Float32Array|Float64Array} T
	 * @param {T} spectrogram The input power (mel) spectrogram. Note that a power spectrogram has the amplitudes squared!
	 * @param {number} [reference=1.0] Sets the input spectrogram value that corresponds to 0 dB.
	 * For example, use `np.max(spectrogram)` to set the loudest part to 0 dB. Must be greater than zero.
	 * @param {number} [min_value=1e-10] The spectrogram will be clipped to this minimum value before conversion to decibels,
	 * to avoid taking `log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero.
	 * @param {number} [db_range=null] Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the
	 * difference between the peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
	 * @returns {T} The modified spectrogram in decibels.
	 */
	function power_to_db(spectrogram, reference = 1.0, min_value = 1e-10, db_range = null) {
	    // Power ratios use a factor of 10 (amplitude ratios use 20).
	    const POWER_DB_FACTOR = 10.0;
	    return _db_conversion_helper(spectrogram, POWER_DB_FACTOR, reference, min_value, db_range);
	}

	/**
	 * Calculates a mel spectrogram over one waveform using the Short-Time Fourier Transform.
	 *
	 * Depending on the options, the result is:
	 *   - a mel spectrogram of amplitudes (`power = 1.0`) or powers (`power = 2.0`)
	 *   - a log-mel spectrogram (additionally provide `log_mel`)
	 *
	 * NOTE: `mel_filters` is currently required. Calling this function without a mel filter bank
	 * (plain or complex-valued spectrograms) is not supported and raises an error.
	 *
	 * In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame.
	 * A padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame,
	 * typically the next power of two.
	 *
	 * @param {Float32Array|Float64Array} waveform The input waveform of shape `(length,)`. This must be a single real-valued, mono waveform.
	 * @param {Float32Array|Float64Array} window The windowing function to apply of shape `(frame_length,)`, including zero-padding if necessary. The actual window length may be
	 * shorter than `frame_length`, but we're assuming the array has already been zero-padded.
	 * @param {number} frame_length The length of the analysis frames in samples (a.k.a., `fft_length`).
	 * @param {number} hop_length The stride between successive analysis frames in samples.
	 * @param {Object} options
	 * @param {number} [options.fft_length=null] The size of the FFT buffer in samples. This determines how many frequency bins the spectrogram will have.
	 * For optimal speed, this should be a power of two. If `null`, uses `frame_length`.
	 * @param {number} [options.power=1.0] If 1.0, returns the amplitude spectrogram. If 2.0, returns the power spectrogram.
	 * `null` (complex output) cannot be combined with `mel_filters` and is therefore rejected.
	 * @param {boolean} [options.center=true] Whether to pad the waveform so that frame `t` is centered around time `t * hop_length`. If `false`, frame
	 * `t` will start at time `t * hop_length`.
	 * @param {string} [options.pad_mode="reflect"] Padding mode used when `center` is `true`. Only `"reflect"` (pads with mirrored values) is implemented.
	 * @param {boolean} [options.onesided=true] If `true`, only computes the positive frequencies and returns a spectrogram containing `fft_length // 2 + 1`
	 * frequency bins. If `false`, also computes the negative frequencies and returns `fft_length` frequency bins.
	 * @param {number} [options.preemphasis=null] Coefficient for a low-pass filter that applies pre-emphasis before the DFT.
	 * @param {boolean} [options.preemphasis_htk_flavor=true] Whether to apply the pre-emphasis filter in the HTK flavor. Only `true` is supported.
	 * @param {number[][]} [options.mel_filters=null] The mel filter bank, one row per mel filter and one column per frequency bin,
	 * i.e. shape `(num_mel_filters, num_frequency_bins)`. Required.
	 * @param {number} [options.mel_floor=1e-10] Minimum value of mel frequency banks.
	 * @param {string} [options.log_mel=null] How to convert the spectrogram to log scale. Possible options are:
	 * `null` (don't convert), `"log"` (take the natural logarithm) `"log10"` (take the base-10 logarithm), `"dB"` (convert to decibels).
	 * Can only be used when `power` is not `null`.
	 * @param {number} [options.reference=1.0] Sets the input spectrogram value that corresponds to 0 dB. For example, use `max(spectrogram)[0]` to set
	 * the loudest part to 0 dB. Must be greater than zero.
	 * @param {number} [options.min_value=1e-10] The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking `log(0)`.
	 * For a power spectrogram, the default of `1e-10` corresponds to a minimum of -100 dB. For an amplitude spectrogram, the value `1e-5` corresponds to -100 dB.
	 * Must be greater than zero.
	 * @param {number} [options.db_range=null] Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
	 * peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
	 * @param {boolean} [options.remove_dc_offset=null] Subtract mean from waveform on each frame, applied before pre-emphasis. This should be set to `true` in
	 * order to get the same results as `torchaudio.compliance.kaldi.fbank` when computing mel filters.
	 * @param {number} [options.max_num_frames=null] If provided, limits the number of frames to compute to this value.
	 * @param {number} [options.min_num_frames=null] If provided, ensures the number of frames to compute is at least this value.
	 * @param {boolean} [options.do_pad=true] If `true`, pads the output spectrogram to have `max_num_frames` frames.
	 * @param {boolean} [options.transpose=false] If `true`, the returned spectrogram will have shape `(num_frames, num_mel_filters)`. If `false`, the returned spectrogram will have shape `(num_mel_filters, num_frames)`.
	 * @returns {Promise<Tensor>} Mel spectrogram of shape `(num_mel_filters, num_frames)` (or its transpose, see `options.transpose`).
	 * @throws {Error} If the arguments are inconsistent or request an unsupported mode.
	 */
	export async function spectrogram(
	    waveform,
	    window,
	    frame_length,
	    hop_length,
	    {
	        fft_length = null,
	        power = 1.0,
	        center = true,
	        pad_mode = "reflect",
	        onesided = true,
	        preemphasis = null,
	        preemphasis_htk_flavor = true,
	        mel_filters = null,
	        mel_floor = 1e-10,
	        log_mel = null,
	        reference = 1.0,
	        min_value = 1e-10,
	        db_range = null,
	        remove_dc_offset = null,

	        // Custom parameters for efficiency reasons
	        min_num_frames = null,
	        max_num_frames = null,
	        do_pad = true,
	        transpose = false,
	    } = {}
	) {
	    const window_length = window.length;
	    if (fft_length === null) {
	        fft_length = frame_length;
	    }
	    if (frame_length > fft_length) {
	        throw Error(`frame_length (${frame_length}) may not be larger than fft_length (${fft_length})`);
	    }

	    if (window_length !== frame_length) {
	        throw new Error(`Length of the window (${window_length}) must equal frame_length (${frame_length})`);
	    }

	    if (hop_length <= 0) {
	        throw new Error("hop_length must be greater than zero");
	    }

	    if (power === null && mel_filters !== null) {
	        throw new Error(
	            "You have provided `mel_filters` but `power` is `None`. Mel spectrogram computation is not yet supported for complex-valued spectrogram. " +
	            "Specify `power` to fix this issue."
	        );
	    }

	    if (mel_filters === null) {
	        // Fail fast with a clear message instead of crashing on `mel_filters.length` after all the FFT work.
	        throw new Error(
	            "`mel_filters` must be provided: computing a spectrogram without a mel filter bank is not currently supported."
	        );
	    }

	    if (!preemphasis_htk_flavor) {
	        throw new Error(
	            "`preemphasis_htk_flavor=false` is not currently supported."
	        );
	    }

	    if (center) {
	        if (pad_mode !== 'reflect') {
	            throw new Error(`pad_mode="${pad_mode}" not implemented yet.`);
	        }
	        const half_window = Math.floor((fft_length - 1) / 2) + 1;
	        waveform = padReflect(waveform, half_window, half_window);
	    }

	    // split waveform into frames of frame_length size
	    let num_frames = Math.floor(1 + Math.floor((waveform.length - frame_length) / hop_length));
	    if (min_num_frames !== null && num_frames < min_num_frames) {
	        num_frames = min_num_frames;
	    }
	    const num_frequency_bins = onesided ? Math.floor(fft_length / 2) + 1 : fft_length;

	    // `d1` is the number of frames actually computed; `d1Max` is the number of frames in the output.
	    let d1 = num_frames;
	    let d1Max = num_frames;

	    // If maximum number of frames is provided, we must either pad or truncate
	    if (max_num_frames !== null) {
	        if (max_num_frames > num_frames) { // input is too short, so we pad
	            if (do_pad) {
	                d1Max = max_num_frames;
	            }
	        } else { // input is too long, so we truncate
	            d1Max = d1 = max_num_frames;
	        }
	    }

	    // Preallocate arrays to store output.
	    const fft = new FFT(fft_length);
	    const inputBuffer = new Float64Array(fft_length);
	    const outputBuffer = new Float64Array(fft.outputBufferSize);
	    const transposedMagnitudeData = new Float32Array(num_frequency_bins * d1Max);

	    for (let i = 0; i < d1; ++i) {
	        // Populate buffer with waveform data
	        const offset = i * hop_length;
	        const buffer_size = Math.min(waveform.length - offset, frame_length);
	        if (buffer_size !== frame_length) {
	            // The full buffer is not needed, so we need to reset it (avoid overflow from previous iterations)
	            // NOTE: We don't need to reset the buffer if it's full since we overwrite the first
	            // `frame_length` values and the rest (`fft_length - frame_length`) remains zero.
	            inputBuffer.fill(0, 0, frame_length);
	        }

	        for (let j = 0; j < buffer_size; ++j) {
	            inputBuffer[j] = waveform[offset + j];
	        }

	        if (remove_dc_offset) {
	            let sum = 0;
	            for (let j = 0; j < buffer_size; ++j) {
	                sum += inputBuffer[j];
	            }
	            const mean = sum / buffer_size;
	            for (let j = 0; j < buffer_size; ++j) {
	                inputBuffer[j] -= mean;
	            }
	        }

	        if (preemphasis !== null) {
	            // Done in reverse to avoid copies and destructive modification
	            for (let j = buffer_size - 1; j >= 1; --j) {
	                inputBuffer[j] -= preemphasis * inputBuffer[j - 1];
	            }
	            inputBuffer[0] *= 1 - preemphasis;
	        }

	        // Apply window function
	        for (let j = 0; j < window.length; ++j) {
	            inputBuffer[j] *= window[j];
	        }

	        fft.realTransform(outputBuffer, inputBuffer);

	        // compute magnitudes
	        for (let j = 0; j < num_frequency_bins; ++j) {
	            // The output buffer is read as interleaved (real, imaginary) pairs.
	            const j2 = j << 1;

	            // Squared magnitude: re^2 + im^2.
	            // NOTE: We transpose the data here to avoid doing it later
	            transposedMagnitudeData[j * d1Max + i] = outputBuffer[j2] ** 2 + outputBuffer[j2 + 1] ** 2;
	        }
	    }

	    if (power !== null && power !== 2) {
	        // slight optimization to not sqrt
	        const pow = power / 2; // we use 2 since we already squared
	        for (let i = 0; i < transposedMagnitudeData.length; ++i) {
	            transposedMagnitudeData[i] **= pow;
	        }
	    }

	    const num_mel_filters = mel_filters.length;

	    // Perform matrix multiplication:
	    // mel_spec = mel_filters @ magnitudes.T
	    //  - mel_filters.shape=(80, 201)
	    //  - magnitudes.shape=(3000, 201) => magnitudes.T.shape=(201, 3000)
	    //  - mel_spec.shape=(80, 3000)
	    let mel_spec = await matmul(
	        // TODO: Make `mel_filters` a Tensor during initialization
	        new Tensor('float32', mel_filters.flat(), [num_mel_filters, num_frequency_bins]),
	        new Tensor('float32', transposedMagnitudeData, [num_frequency_bins, d1Max]),
	    );
	    if (transpose) {
	        mel_spec = mel_spec.transpose(1, 0);
	    }

	    const mel_spec_data = /** @type {Float32Array} */(mel_spec.data);
	    for (let i = 0; i < mel_spec_data.length; ++i) {
	        mel_spec_data[i] = Math.max(mel_floor, mel_spec_data[i]);
	    }

	    if (power !== null && log_mel !== null) {
	        const o = Math.min(mel_spec_data.length, d1 * num_mel_filters);
	        // NOTE: operates in-place
	        switch (log_mel) {
	            case 'log':
	                for (let i = 0; i < o; ++i) {
	                    mel_spec_data[i] = Math.log(mel_spec_data[i]);
	                }
	                break;
	            case 'log10':
	                for (let i = 0; i < o; ++i) {
	                    mel_spec_data[i] = Math.log10(mel_spec_data[i]);
	                }
	                break;
	            case 'dB':
	                if (power === 1.0) {
	                    amplitude_to_db(mel_spec_data, reference, min_value, db_range);
	                } else if (power === 2.0) {
	                    power_to_db(mel_spec_data, reference, min_value, db_range);
	                } else {
	                    throw new Error(`Cannot use log_mel option '${log_mel}' with power ${power}`);
	                }
	                break;
	            default:
	                throw new Error(`log_mel must be one of null, 'log', 'log10' or 'dB'. Got '${log_mel}'`);
	        }
	    }

	    return mel_spec;
	}

	/**
	 * Returns an array containing the specified window.
	 * @param {number} window_length The length of the window in samples.
	 * @param {string} name The name of the window function. One of `"boxcar"`, `"hann"`, `"hann_window"`,
	 * `"hamming"` or `"povey"`.
	 * @param {Object} options Additional options.
	 * @param {boolean} [options.periodic=true] Whether the window is periodic or symmetric.
	 * @param {number} [options.frame_length=null] The length of the analysis frames in samples.
	 * Provide a value for `frame_length` if the window is smaller than the frame length, so that it will be zero-padded.
	 * @param {boolean} [options.center=true] Whether to center the window inside the FFT buffer. Only used when `frame_length` is provided.
	 * @returns {Float64Array} The window of shape `(window_length,)` or `(frame_length,)`.
	 * @throws {Error} If `name` is not a known window type, or if `window_length` exceeds `frame_length`.
	 */
	export function window_function(window_length, name, {
	    periodic = true,
	    frame_length = null,
	    center = true,
	} = {}) {
	    // A periodic window is a symmetric window of one extra sample with the last sample dropped.
	    const length = periodic ? window_length + 1 : window_length;
	    let window;
	    switch (name) {
	        case 'boxcar':
	            window = new Float64Array(length).fill(1.0);
	            break;
	        case 'hann':
	        case 'hann_window':
	            window = hanning(length);
	            break;
	        case 'hamming':
	            window = hamming(length);
	            break;
	        case 'povey':
	            window = hanning(length).map(x => Math.pow(x, 0.85));
	            break;
	        default:
	            throw new Error(`Unknown window type ${name}.`);
	    }
	    if (periodic) {
	        window = window.subarray(0, window_length);
	    }
	    if (frame_length === null) {
	        return window;
	    }
	    if (window_length > frame_length) {
	        throw new Error(`Length of the window (${window_length}) may not be larger than frame_length (${frame_length})`);
	    }
	    if (window_length === frame_length) {
	        // Nothing to pad.
	        return window;
	    }

	    // Zero-pad the window up to `frame_length`, either centered or left-aligned.
	    const padded = new Float64Array(frame_length);
	    const offset = center ? Math.floor((frame_length - window_length) / 2) : 0;
	    padded.set(window, offset);
	    return padded;
	}

	/**
	 * Encode audio data to a WAV file: a 44-byte header followed by the samples written as
	 * little-endian 32-bit floats (single channel).
	 * WAV file specs : https://en.wikipedia.org/wiki/WAV#WAV_File_header
	 *
	 * Adapted from https://www.npmjs.com/package/audiobuffer-to-wav
	 * @param {Float32Array} samples The audio samples.
	 * @param {number} rate The sample rate.
	 * @returns {ArrayBuffer} The WAV audio buffer.
	 */
	function encodeWAV(samples, rate) {
	    const HEADER_BYTES = 44;
	    const data_bytes = samples.length * 4;

	    const buffer = new ArrayBuffer(HEADER_BYTES + data_bytes);
	    const view = new DataView(buffer);

	    // --- RIFF container ---
	    writeString(view, 0, "RIFF");                  // RIFF identifier
	    view.setUint32(4, 36 + data_bytes, true);      // RIFF chunk length
	    writeString(view, 8, "WAVE");                  // RIFF type

	    // --- format chunk ---
	    writeString(view, 12, "fmt ");                 // format chunk identifier
	    view.setUint32(16, 16, true);                  // format chunk length
	    view.setUint16(20, 3, true);                   // sample format (raw)
	    view.setUint16(22, 1, true);                   // channel count
	    view.setUint32(24, rate, true);                // sample rate
	    view.setUint32(28, rate * 4, true);            // byte rate (sample rate * block align)
	    view.setUint16(32, 4, true);                   // block align (channel count * bytes per sample)
	    view.setUint16(34, 32, true);                  // bits per sample

	    // --- data chunk ---
	    writeString(view, 36, "data");                 // data chunk identifier
	    view.setUint32(40, data_bytes, true);          // data chunk length

	    for (let index = 0; index < samples.length; ++index) {
	        view.setFloat32(HEADER_BYTES + index * 4, samples[index], true);
	    }

	    return buffer;
	}

	/**
	 * Writes a string into a `DataView`, one byte per UTF-16 code unit (`charCodeAt`),
	 * starting at the given byte offset.
	 * @param {DataView} view The view to write into.
	 * @param {number} offset Byte offset of the first character.
	 * @param {string} string The string to write.
	 */
	function writeString(view, offset, string) {
	    const count = string.length;
	    let index = 0;
	    while (index < count) {
	        view.setUint8(offset + index, string.charCodeAt(index));
	        index += 1;
	    }
	}


	/**
	 * Raw audio samples together with their sampling rate, with helpers to export them as WAV.
	 */
	export class RawAudio {

	    /**
	     * Create a new `RawAudio` object.
	     * @param {Float32Array} audio Audio data
	     * @param {number} sampling_rate Sampling rate of the audio data
	     */
	    constructor(audio, sampling_rate) {
	        this.audio = audio;
	        this.sampling_rate = sampling_rate;
	    }

	    /**
	     * Convert the audio to a wav file buffer.
	     * @returns {ArrayBuffer} The WAV file.
	     */
	    toWav() {
	        return encodeWAV(this.audio, this.sampling_rate);
	    }

	    /**
	     * Convert the audio to a blob of type `audio/wav`.
	     * @returns {Blob}
	     */
	    toBlob() {
	        const wav = this.toWav();
	        return new Blob([wav], { type: 'audio/wav' });
	    }

	    /**
	     * Save the audio to a wav file.
	     *
	     * In a browser environment the blob is handed to `saveBlob`; otherwise, when the filesystem
	     * is available, the file is written with `fs.writeFileSync`.
	     *
	     * @param {string} path The destination path (or file name).
	     * @returns {Promise<void>}
	     * @throws {Error} When called from a Web Worker, or when no filesystem is available.
	     */
	    async save(path) {
	        let fn;

	        if (apis.IS_BROWSER_ENV) {
	            if (apis.IS_WEBWORKER_ENV) {
	                throw new Error('Unable to save a file from a Web Worker.');
	            }
	            fn = saveBlob;
	        } else if (apis.IS_FS_AVAILABLE) {
	            // Must accept (path, blob), matching how `fn` is invoked below.
	            fn = async (/** @type {string} */ file_path, /** @type {Blob} */ blob) => {
	                const buffer = await blob.arrayBuffer();
	                fs.writeFileSync(file_path, Buffer.from(buffer));
	            };
	        } else {
	            throw new Error('Unable to save because filesystem is disabled in this environment.');
	        }

	        await fn(path, this.toBlob());
	    }
	}