Spaces:
Running
Running
| /** | |
| * Keet - Mel Spectrogram Math | |
| * | |
| * Pure computation functions for mel spectrogram feature extraction. | |
| * Matches NeMo / onnx-asr / parakeet.js mel.js exactly. | |
| * | |
| * Designed to be self-contained and reusable: | |
| * - No external dependencies | |
| * - All functions are pure (no side effects) | |
| * - Can be imported by workers, tests, or bundled as a standalone package | |
| */ | |
| // βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| // Constants | |
| // βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| export const MEL_CONSTANTS = { | |
| SAMPLE_RATE: 16000, | |
| N_FFT: 512, | |
| WIN_LENGTH: 400, | |
| HOP_LENGTH: 160, | |
| PREEMPH: 0.97, | |
| LOG_ZERO_GUARD: 2 ** -24, // float(2**-24) β 5.96e-8 | |
| N_FREQ_BINS: (512 >> 1) + 1, // 257 | |
| DEFAULT_N_MELS: 128, | |
| } as const; | |
| // Slaney Mel Scale constants | |
| const F_SP = 200.0 / 3; // ~66.667 Hz spacing in linear region | |
| const MIN_LOG_HZ = 1000.0; | |
| const MIN_LOG_MEL = MIN_LOG_HZ / F_SP; // = 15.0 | |
| const LOG_STEP = Math.log(6.4) / 27.0; | |
| // βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| // Mel Scale Helpers | |
| // βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| /** | |
| * Convert frequency in Hz to mel scale (Slaney variant). | |
| */ | |
| export function hzToMel(freq: number): number { | |
| return freq >= MIN_LOG_HZ | |
| ? MIN_LOG_MEL + Math.log(freq / MIN_LOG_HZ) / LOG_STEP | |
| : freq / F_SP; | |
| } | |
| /** | |
| * Convert mel scale value back to Hz (Slaney variant). | |
| */ | |
| export function melToHz(mel: number): number { | |
| return mel >= MIN_LOG_MEL | |
| ? MIN_LOG_HZ * Math.exp(LOG_STEP * (mel - MIN_LOG_MEL)) | |
| : mel * F_SP; | |
| } | |
| /** | |
| * Create mel filterbank matrix [nMels Γ N_FREQ_BINS] with Slaney normalization. | |
| * Returns a flat Float32Array in row-major order. | |
| */ | |
| export function createMelFilterbank(nMels: number): Float32Array { | |
| const { SAMPLE_RATE, N_FREQ_BINS } = MEL_CONSTANTS; | |
| const fMax = SAMPLE_RATE / 2; // 8000 | |
| const allFreqs = new Float64Array(N_FREQ_BINS); | |
| for (let i = 0; i < N_FREQ_BINS; i++) { | |
| allFreqs[i] = (fMax * i) / (N_FREQ_BINS - 1); | |
| } | |
| const melMin = hzToMel(0); | |
| const melMax = hzToMel(fMax); | |
| const nPoints = nMels + 2; | |
| const fPts = new Float64Array(nPoints); | |
| for (let i = 0; i < nPoints; i++) { | |
| fPts[i] = melToHz(melMin + ((melMax - melMin) * i) / (nPoints - 1)); | |
| } | |
| const fDiff = new Float64Array(nPoints - 1); | |
| for (let i = 0; i < nPoints - 1; i++) { | |
| fDiff[i] = fPts[i + 1] - fPts[i]; | |
| } | |
| const fb = new Float32Array(nMels * N_FREQ_BINS); | |
| for (let m = 0; m < nMels; m++) { | |
| const enorm = 2.0 / (fPts[m + 2] - fPts[m]); // slaney normalization | |
| const fbOffset = m * N_FREQ_BINS; | |
| for (let k = 0; k < N_FREQ_BINS; k++) { | |
| const downSlope = (allFreqs[k] - fPts[m]) / fDiff[m]; | |
| const upSlope = (fPts[m + 2] - allFreqs[k]) / fDiff[m + 1]; | |
| fb[fbOffset + k] = Math.max(0, Math.min(downSlope, upSlope)) * enorm; | |
| } | |
| } | |
| return fb; | |
| } | |
| /** | |
| * Create a Hann window of length WIN_LENGTH, zero-padded to N_FFT. | |
| */ | |
| export function createPaddedHannWindow(): Float64Array { | |
| const { N_FFT, WIN_LENGTH } = MEL_CONSTANTS; | |
| const window = new Float64Array(N_FFT); | |
| const padLeft = (N_FFT - WIN_LENGTH) >> 1; // 56 | |
| for (let n = 0; n < WIN_LENGTH; n++) { | |
| window[padLeft + n] = 0.5 * (1 - Math.cos((2 * Math.PI * n) / (WIN_LENGTH - 1))); | |
| } | |
| return window; | |
| } | |
| /** | |
| * Precompute FFT twiddle factors for a given size N. | |
| */ | |
| export function precomputeTwiddles(N: number): { cos: Float64Array; sin: Float64Array } { | |
| const half = N >> 1; | |
| const cos = new Float64Array(half); | |
| const sin = new Float64Array(half); | |
| for (let i = 0; i < half; i++) { | |
| const angle = (-2 * Math.PI * i) / N; | |
| cos[i] = Math.cos(angle); | |
| sin[i] = Math.sin(angle); | |
| } | |
| return { cos, sin }; | |
| } | |
| /** | |
| * In-place radix-2 Cooley-Tukey FFT. | |
| * @param re Real part (modified in-place) | |
| * @param im Imaginary part (modified in-place) | |
| * @param n FFT size (must be power of 2) | |
| * @param tw Precomputed twiddle factors | |
| */ | |
| export function fft(re: Float64Array, im: Float64Array, n: number, tw: { cos: Float64Array; sin: Float64Array }): void { | |
| // Bit-reversal permutation | |
| for (let i = 1, j = 0; i < n; i++) { | |
| let bit = n >> 1; | |
| while (j & bit) { j ^= bit; bit >>= 1; } | |
| j ^= bit; | |
| if (i < j) { | |
| let tmp = re[i]; re[i] = re[j]; re[j] = tmp; | |
| tmp = im[i]; im[i] = im[j]; im[j] = tmp; | |
| } | |
| } | |
| // Cooley-Tukey butterfly | |
| for (let size = 2; size <= n; size <<= 1) { | |
| const half = size >> 1; | |
| const step = n / size; | |
| for (let i = 0; i < n; i += size) { | |
| for (let j = 0; j < half; j++) { | |
| const idx = j * step; | |
| const tRe = re[i + j + half] * tw.cos[idx] - im[i + j + half] * tw.sin[idx]; | |
| const tIm = re[i + j + half] * tw.sin[idx] + im[i + j + half] * tw.cos[idx]; | |
| re[i + j + half] = re[i + j] - tRe; | |
| im[i + j + half] = im[i + j] - tIm; | |
| re[i + j] += tRe; | |
| im[i + j] += tIm; | |
| } | |
| } | |
| } | |
| } | |
| /** | |
| * Apply pre-emphasis filter to audio samples. | |
| * @param chunk Raw audio chunk | |
| * @param lastSample Last sample from previous chunk (for continuity) | |
| * @param coeff Pre-emphasis coefficient (default 0.97) | |
| * @returns Pre-emphasized samples | |
| */ | |
| export function preemphasize(chunk: Float32Array, lastSample: number = 0, coeff: number = MEL_CONSTANTS.PREEMPH): Float32Array { | |
| const out = new Float32Array(chunk.length); | |
| out[0] = chunk[0] - coeff * lastSample; | |
| for (let i = 1; i < chunk.length; i++) { | |
| out[i] = chunk[i] - coeff * chunk[i - 1]; | |
| } | |
| return out; | |
| } | |
| /** | |
| * Compute a single mel spectrogram frame from pre-emphasized audio. | |
| * @param preemphAudio Full pre-emphasized audio buffer | |
| * @param frameIdx Frame index | |
| * @param hannWindow Pre-computed Hann window | |
| * @param twiddles Pre-computed FFT twiddle factors | |
| * @param melFilterbank Pre-computed mel filterbank | |
| * @param nMels Number of mel bins | |
| * @returns Raw (un-normalized) log-mel values for this frame | |
| */ | |
| export function computeMelFrame( | |
| preemphAudio: Float32Array, | |
| frameIdx: number, | |
| hannWindow: Float64Array, | |
| twiddles: { cos: Float64Array; sin: Float64Array }, | |
| melFilterbank: Float32Array, | |
| nMels: number, | |
| ): Float32Array { | |
| const { N_FFT, HOP_LENGTH, N_FREQ_BINS, LOG_ZERO_GUARD } = MEL_CONSTANTS; | |
| const pad = N_FFT >> 1; // 256 | |
| const frameStart = frameIdx * HOP_LENGTH - pad; | |
| const preemphLen = preemphAudio.length; | |
| // Window the frame | |
| const fftRe = new Float64Array(N_FFT); | |
| const fftIm = new Float64Array(N_FFT); | |
| for (let k = 0; k < N_FFT; k++) { | |
| const idx = frameStart + k; | |
| const sample = (idx >= 0 && idx < preemphLen) ? preemphAudio[idx] : 0; | |
| fftRe[k] = sample * hannWindow[k]; | |
| fftIm[k] = 0; | |
| } | |
| // FFT | |
| fft(fftRe, fftIm, N_FFT, twiddles); | |
| // Power spectrum | |
| const power = new Float32Array(N_FREQ_BINS); | |
| for (let k = 0; k < N_FREQ_BINS; k++) { | |
| power[k] = fftRe[k] * fftRe[k] + fftIm[k] * fftIm[k]; | |
| } | |
| // Mel filterbank multiply + log | |
| const melFrame = new Float32Array(nMels); | |
| for (let m = 0; m < nMels; m++) { | |
| let melVal = 0; | |
| const fbOff = m * N_FREQ_BINS; | |
| for (let k = 0; k < N_FREQ_BINS; k++) { | |
| melVal += power[k] * melFilterbank[fbOff + k]; | |
| } | |
| melFrame[m] = Math.log(melVal + LOG_ZERO_GUARD); | |
| } | |
| return melFrame; | |
| } | |
| /** | |
| * Normalize mel features per-feature with Bessel-corrected variance. | |
| * @param features Flat array [nMels Γ T], mel-major layout | |
| * @param nMels Number of mel bins | |
| * @param T Number of time frames | |
| * @returns Normalized features (new array) | |
| */ | |
| export function normalizeMelFeatures(features: Float32Array, nMels: number, T: number): Float32Array { | |
| const out = new Float32Array(features.length); | |
| for (let m = 0; m < nMels; m++) { | |
| const base = m * T; | |
| // Copy and compute mean | |
| let sum = 0; | |
| for (let t = 0; t < T; t++) { | |
| out[base + t] = features[base + t]; | |
| sum += features[base + t]; | |
| } | |
| const mean = sum / T; | |
| // Variance | |
| let varSum = 0; | |
| for (let t = 0; t < T; t++) { | |
| const d = out[base + t] - mean; | |
| varSum += d * d; | |
| } | |
| const invStd = T > 1 | |
| ? 1.0 / (Math.sqrt(varSum / (T - 1)) + 1e-5) | |
| : 0; | |
| // Normalize | |
| for (let t = 0; t < T; t++) { | |
| out[base + t] = (out[base + t] - mean) * invStd; | |
| } | |
| } | |
| return out; | |
| } | |
| /** | |
| * Convert sample offset to frame index. | |
| */ | |
| export function sampleToFrame(sampleOffset: number): number { | |
| return Math.floor(sampleOffset / MEL_CONSTANTS.HOP_LENGTH); | |
| } | |