/** * Audio Processing for LFM2-Audio ONNX Runtime Web * * Computes mel spectrograms from audio input for the audio encoder. * Matches the Python compute_mel_spectrogram_numpy implementation. */ // Default mel config (matches mel_config.json) const DEFAULT_MEL_CONFIG = { sample_rate: 16000, n_fft: 512, win_length: 400, hop_length: 160, n_mels: 128, fmin: 0, fmax: 8000, preemph: 0.97, log_zero_guard: 5.960464477539063e-08, normalize: 'per_feature', mel_norm: 'slaney', }; let melConfig = { ...DEFAULT_MEL_CONFIG }; let melFilterbank = null; /** * Load mel config from model path * @param {string} modelPath - Path to model directory */ export async function loadMelConfig(modelPath) { try { const response = await fetch(`${modelPath}/onnx/mel_config.json`, { mode: 'cors', credentials: 'omit', }); if (response.ok) { melConfig = await response.json(); console.log('Loaded mel config:', melConfig); } } catch (e) { console.warn('Could not load mel_config.json, using defaults'); } // Pre-compute mel filterbank melFilterbank = createMelFilterbank( melConfig.sample_rate, melConfig.n_fft, melConfig.n_mels, melConfig.fmin, melConfig.fmax ); } /** * Create mel filterbank matrix (simplified slaney normalization) * @param {number} sr - Sample rate * @param {number} nFft - FFT size * @param {number} nMels - Number of mel bands * @param {number} fmin - Minimum frequency * @param {number} fmax - Maximum frequency * @returns {Float32Array[]} - Mel filterbank [n_mels, n_fft/2+1] */ function createMelFilterbank(sr, nFft, nMels, fmin, fmax) { const nFreqs = Math.floor(nFft / 2) + 1; // Mel scale conversion functions const hzToMel = (hz) => 2595 * Math.log10(1 + hz / 700); const melToHz = (mel) => 700 * (Math.pow(10, mel / 2595) - 1); // Create mel points const melMin = hzToMel(fmin); const melMax = hzToMel(fmax); const melPoints = new Float32Array(nMels + 2); for (let i = 0; i < nMels + 2; i++) { melPoints[i] = melMin + (melMax - melMin) * i / (nMels + 1); } // Convert back to Hz and then to FFT bins const hzPoints = melPoints.map(melToHz); const binPoints = hzPoints.map((hz) => Math.floor((nFft + 1) * hz / sr)); // Create filterbank const filterbank = []; for (let m = 0; m < nMels; m++) { const filter = new Float32Array(nFreqs); const start = binPoints[m]; const center = binPoints[m + 1]; const end = binPoints[m + 2]; // Rising edge for (let k = start; k < center; k++) { if (k < nFreqs) { filter[k] = (k - start) / (center - start); } } // Falling edge for (let k = center; k < end; k++) { if (k < nFreqs) { filter[k] = (end - k) / (end - center); } } // Slaney normalization const enorm = 2.0 / (hzPoints[m + 2] - hzPoints[m]); for (let k = 0; k < nFreqs; k++) { filter[k] *= enorm; } filterbank.push(filter); } return filterbank; } /** * Create Hann window * @param {number} length - Window length * @returns {Float32Array} - Hann window */ function createHannWindow(length) { const window = new Float32Array(length); for (let i = 0; i < length; i++) { window[i] = 0.5 * (1 - Math.cos(2 * Math.PI * i / (length - 1))); } return window; } /** * Resample audio to target sample rate (simple linear interpolation) * @param {Float32Array} audio - Input audio * @param {number} srcSr - Source sample rate * @param {number} dstSr - Target sample rate * @returns {Float32Array} - Resampled audio */ function resampleAudio(audio, srcSr, dstSr) { if (srcSr === dstSr) return audio; const ratio = srcSr / dstSr; const newLength = Math.floor(audio.length / ratio); const resampled = new Float32Array(newLength); for (let i = 0; i < newLength; i++) { const srcIdx = i * ratio; const srcIdxFloor = Math.floor(srcIdx); const srcIdxCeil = Math.min(srcIdxFloor + 1, audio.length - 1); const frac = srcIdx - srcIdxFloor; resampled[i] = audio[srcIdxFloor] * (1 - frac) + audio[srcIdxCeil] * frac; } return resampled; } // === FFT Cache for Mel Spectrogram === let _fftCache = null; /** * Initialize radix-2 FFT for a given size (must be power of 2) */ function initFFT(n) { if (_fftCache && _fftCache.n === n) return _fftCache; // Precompute twiddle factors const twiddleRe = new Float32Array(n / 2); const twiddleIm = new Float32Array(n / 2); for (let i = 0; i < n / 2; i++) { const angle = -2 * Math.PI * i / n; twiddleRe[i] = Math.cos(angle); twiddleIm[i] = Math.sin(angle); } // Precompute bit-reversal permutation const bitrev = new Uint32Array(n); for (let i = 0; i < n; i++) { let j = 0; let x = i; for (let k = 1; k < n; k <<= 1) { j = (j << 1) | (x & 1); x >>= 1; } bitrev[i] = j; } // Reusable work arrays const workRe = new Float32Array(n); const workIm = new Float32Array(n); _fftCache = { n, twiddleRe, twiddleIm, bitrev, workRe, workIm }; return _fftCache; } /** * Compute Real FFT magnitude using radix-2 Cooley-Tukey * @param {Float32Array} frame - Input frame (length must be power of 2) * @returns {Float32Array} - Magnitude spectrum [n/2+1] */ function computeRfftMagnitude(frame) { const n = frame.length; const nFreqs = Math.floor(n / 2) + 1; const cache = initFFT(n); const { twiddleRe, twiddleIm, bitrev, workRe, workIm } = cache; // Copy input with bit-reversal permutation for (let i = 0; i < n; i++) { workRe[bitrev[i]] = frame[i]; workIm[bitrev[i]] = 0; } // Cooley-Tukey butterflies for (let len = 2; len <= n; len <<= 1) { const halfLen = len >> 1; const step = n / len; for (let i = 0; i < n; i += len) { for (let j = 0; j < halfLen; j++) { const twIdx = j * step; const wRe = twiddleRe[twIdx]; const wIm = twiddleIm[twIdx]; const u = i + j; const v = u + halfLen; const tRe = wRe * workRe[v] - wIm * workIm[v]; const tIm = wRe * workIm[v] + wIm * workRe[v]; workRe[v] = workRe[u] - tRe; workIm[v] = workIm[u] - tIm; workRe[u] += tRe; workIm[u] += tIm; } } } // Compute magnitude for positive frequencies const magnitude = new Float32Array(nFreqs); for (let k = 0; k < nFreqs; k++) { magnitude[k] = Math.sqrt(workRe[k] * workRe[k] + workIm[k] * workIm[k]); } return magnitude; } /** * Compute mel spectrogram from audio data * @param {Float32Array} audioData - Audio samples in [-1, 1] * @param {number} sampleRate - Audio sample rate * @returns {{melFeatures: Float32Array, numFrames: number}} - Mel features [time, n_mels] */ export function computeMelSpectrogram(audioData, sampleRate) { const { sample_rate: targetSr, n_fft: nFft, win_length: winLength, hop_length: hopLength, preemph, log_zero_guard: logZeroGuard, n_mels: nMels, } = melConfig; // Ensure filterbank is created if (!melFilterbank) { melFilterbank = createMelFilterbank(targetSr, nFft, nMels, melConfig.fmin, melConfig.fmax); } // 1. Resample to target sample rate let audio = resampleAudio(audioData, sampleRate, targetSr); // 2. Pre-emphasis filter: y[t] = x[t] - preemph * x[t-1] const audioPreemph = new Float32Array(audio.length); audioPreemph[0] = audio[0]; for (let i = 1; i < audio.length; i++) { audioPreemph[i] = audio[i] - preemph * audio[i - 1]; } // 3. Pad for center=True STFT const padAmount = Math.floor(nFft / 2); const audioPadded = new Float32Array(audio.length + 2 * padAmount); audioPadded.set(audioPreemph, padAmount); // 4. Frame the signal with windowing const numFrames = 1 + Math.floor((audioPadded.length - nFft) / hopLength); const nFreqs = Math.floor(nFft / 2) + 1; // Create window (centered in frame) const hannWindow = createHannWindow(winLength); const padLeft = Math.floor((nFft - winLength) / 2); const paddedWindow = new Float32Array(nFft); for (let i = 0; i < winLength; i++) { paddedWindow[padLeft + i] = hannWindow[i]; } // 5. Compute STFT magnitude and mel spectrogram const melFeatures = new Float32Array(numFrames * nMels); for (let frameIdx = 0; frameIdx < numFrames; frameIdx++) { // Extract and window frame const start = frameIdx * hopLength; const frame = new Float32Array(nFft); for (let i = 0; i < nFft; i++) { frame[i] = audioPadded[start + i] * paddedWindow[i]; } // Compute magnitude spectrum const magnitude = computeRfftMagnitude(frame); // Apply mel filterbank for (let m = 0; m < nMels; m++) { let melVal = 0; for (let k = 0; k < nFreqs; k++) { melVal += melFilterbank[m][k] * magnitude[k] * magnitude[k]; // Power spectrum } // Log mel with guard melFeatures[frameIdx * nMels + m] = Math.log(Math.max(melVal, logZeroGuard)); } } // 6. Per-feature normalization (if enabled) if (melConfig.normalize === 'per_feature') { for (let m = 0; m < nMels; m++) { let mean = 0; let std = 0; for (let t = 0; t < numFrames; t++) { mean += melFeatures[t * nMels + m]; } mean /= numFrames; for (let t = 0; t < numFrames; t++) { const diff = melFeatures[t * nMels + m] - mean; std += diff * diff; } std = Math.sqrt(std / numFrames + 1e-5); for (let t = 0; t < numFrames; t++) { melFeatures[t * nMels + m] = (melFeatures[t * nMels + m] - mean) / std; } } } return { melFeatures, numFrames }; } /** * Load audio file and decode to Float32Array * @param {File|Blob} file - Audio file * @returns {Promise<{audioData: Float32Array, sampleRate: number}>} */ export async function loadAudioFile(file) { const arrayBuffer = await file.arrayBuffer(); const audioContext = new (window.AudioContext || window.webkitAudioContext)(); try { const audioBuffer = await audioContext.decodeAudioData(arrayBuffer); // Get mono audio (average channels if stereo) let audioData; if (audioBuffer.numberOfChannels === 1) { audioData = audioBuffer.getChannelData(0); } else { const ch0 = audioBuffer.getChannelData(0); const ch1 = audioBuffer.getChannelData(1); audioData = new Float32Array(ch0.length); for (let i = 0; i < ch0.length; i++) { audioData[i] = (ch0[i] + ch1[i]) / 2; } } return { audioData: new Float32Array(audioData), // Copy to avoid detached buffer issues sampleRate: audioBuffer.sampleRate, }; } finally { audioContext.close(); } } /** * Record audio from microphone * @param {number} maxDurationMs - Maximum recording duration in ms * @returns {Promise<{audioData: Float32Array, sampleRate: number}>} */ export async function recordAudio(maxDurationMs = 30000) { const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); const mediaRecorder = new MediaRecorder(stream); const chunks = []; return new Promise((resolve, reject) => { mediaRecorder.ondataavailable = (e) => chunks.push(e.data); mediaRecorder.onstop = async () => { stream.getTracks().forEach((track) => track.stop()); const blob = new Blob(chunks, { type: 'audio/webm' }); try { const result = await loadAudioFile(blob); resolve(result); } catch (e) { reject(e); } }; mediaRecorder.onerror = (e) => { stream.getTracks().forEach((track) => track.stop()); reject(e); }; mediaRecorder.start(); // Auto-stop after max duration setTimeout(() => { if (mediaRecorder.state === 'recording') { mediaRecorder.stop(); } }, maxDurationMs); }); } export { melConfig };