// Spaces: ysdede / keet-streaming · Running · File size: 9,558 Bytes · b8cc2bf

/**
 * Keet - Mel Spectrogram Math
 * 
 * Pure computation functions for mel spectrogram feature extraction.
 * Matches NeMo / onnx-asr / parakeet.js mel.js exactly.
 * 
 * Designed to be self-contained and reusable:
 *   - No external dependencies
 *   - Functions are pure (no side effects), except fft(), which transforms its re/im arrays in place
 *   - Can be imported by workers, tests, or bundled as a standalone package
 */

// ═══════════════════════════════════════════════════════════════════════════
// Constants
// ═══════════════════════════════════════════════════════════════════════════

export const MEL_CONSTANTS = {
    /** Audio sample rate in Hz. */
    SAMPLE_RATE: 16000,
    /** FFT size in samples. */
    N_FFT: 512,
    /** Hann window length in samples (zero-padded up to N_FFT before the FFT). */
    WIN_LENGTH: 400,
    /** Step between the starts of consecutive frames, in samples. */
    HOP_LENGTH: 160,
    /** Default pre-emphasis coefficient. */
    PREEMPH: 0.97,
    /** Added to each mel energy before the log so zero energy stays finite (2^-24 ≈ 5.96e-8). */
    LOG_ZERO_GUARD: 1 / 16777216,
    /** Size of the one-sided spectrum: N_FFT / 2 + 1 = 257. */
    N_FREQ_BINS: 512 / 2 + 1,
    /** Default number of mel bands. */
    DEFAULT_N_MELS: 128,
} as const;

// Slaney mel scale: linear below MIN_LOG_HZ, logarithmic at and above it.
const F_SP = 200 / 3; // Hz per mel in the linear region (~66.667)
const MIN_LOG_HZ = 1000; // frequency where the log region begins
const MIN_LOG_MEL = MIN_LOG_HZ / F_SP; // mel value at MIN_LOG_HZ (~15)
const LOG_STEP = Math.log(6.4) / 27; // natural-log growth per mel in the log region

// ═══════════════════════════════════════════════════════════════════════════
// Mel Scale Helpers
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Map a frequency in Hz onto the Slaney mel scale.
 *
 * Frequencies below MIN_LOG_HZ use the linear mapping `freq / F_SP`;
 * frequencies at or above it use the logarithmic mapping that continues
 * from MIN_LOG_MEL.
 *
 * @param freq Frequency in Hz
 * @returns Corresponding mel value
 */
export function hzToMel(freq: number): number {
    if (freq >= MIN_LOG_HZ) {
        const logRatio = Math.log(freq / MIN_LOG_HZ);
        return MIN_LOG_MEL + logRatio / LOG_STEP;
    }
    return freq / F_SP;
}

/**
 * Map a Slaney mel value back to a frequency in Hz (inverse of hzToMel).
 *
 * Mel values below MIN_LOG_MEL use the linear mapping `mel * F_SP`;
 * values at or above it use the exponential mapping anchored at MIN_LOG_HZ.
 *
 * @param mel Mel-scale value
 * @returns Corresponding frequency in Hz
 */
export function melToHz(mel: number): number {
    if (mel >= MIN_LOG_MEL) {
        const melsAboveBreak = mel - MIN_LOG_MEL;
        return MIN_LOG_HZ * Math.exp(LOG_STEP * melsAboveBreak);
    }
    return mel * F_SP;
}

/**
 * Build the triangular mel filterbank used to project a power spectrum
 * onto `nMels` mel bands, with Slaney (area) normalization.
 *
 * Band edges are spaced evenly on the mel scale between 0 Hz and
 * SAMPLE_RATE / 2, then mapped back to Hz. Filter `m` rises from edge `m`
 * to edge `m + 1` and falls back to zero at edge `m + 2`.
 *
 * @param nMels Number of mel bands (rows)
 * @returns Flat row-major Float32Array of shape [nMels × N_FREQ_BINS]
 */
export function createMelFilterbank(nMels: number): Float32Array {
    const binCount = MEL_CONSTANTS.N_FREQ_BINS;
    const nyquist = MEL_CONSTANTS.SAMPLE_RATE / 2; // 8000

    // Frequency of every FFT bin, evenly spaced over [0, nyquist].
    const binHz = new Float64Array(binCount);
    for (let bin = 0; bin < binCount; bin += 1) {
        binHz[bin] = (nyquist * bin) / (binCount - 1);
    }

    // nMels + 2 band edges: evenly spaced in mel, converted back to Hz.
    const melLow = hzToMel(0);
    const melHigh = hzToMel(nyquist);
    const edgeCount = nMels + 2;
    const edgeHz = new Float64Array(edgeCount);
    for (let e = 0; e < edgeCount; e += 1) {
        edgeHz[e] = melToHz(melLow + ((melHigh - melLow) * e) / (edgeCount - 1));
    }

    // Width in Hz between each pair of neighbouring edges.
    const gapHz = new Float64Array(edgeCount - 1);
    for (let e = 1; e < edgeCount; e += 1) {
        gapHz[e - 1] = edgeHz[e] - edgeHz[e - 1];
    }

    const filters = new Float32Array(nMels * binCount);
    for (let band = 0; band < nMels; band += 1) {
        const leftEdge = edgeHz[band];
        const rightEdge = edgeHz[band + 2];
        const riseWidth = gapHz[band];
        const fallWidth = gapHz[band + 1];
        // Slaney normalization: scale each triangle by 2 / (band width in Hz).
        const areaNorm = 2.0 / (rightEdge - leftEdge);
        const rowStart = band * binCount;

        for (let bin = 0; bin < binCount; bin += 1) {
            const hz = binHz[bin];
            const rising = (hz - leftEdge) / riseWidth;
            const falling = (rightEdge - hz) / fallWidth;
            // The triangle is the lower of the two ramps, clipped at zero.
            filters[rowStart + bin] = Math.max(0, Math.min(rising, falling)) * areaNorm;
        }
    }
    return filters;
}

/**
 * Build the analysis window: a symmetric Hann window of WIN_LENGTH samples
 * centred inside an N_FFT-long buffer, with zeros on both sides.
 *
 * @returns Float64Array of length N_FFT
 */
export function createPaddedHannWindow(): Float64Array {
    const fftSize = MEL_CONSTANTS.N_FFT;
    const winLen = MEL_CONSTANTS.WIN_LENGTH;
    const leadingZeros = (fftSize - winLen) >> 1; // 56

    const padded = new Float64Array(fftSize); // starts out all zeros
    const lastIndex = winLen - 1;
    for (let n = 0, pos = leadingZeros; n < winLen; n += 1, pos += 1) {
        padded[pos] = 0.5 * (1 - Math.cos((2 * Math.PI * n) / lastIndex));
    }
    return padded;
}

/**
 * Build the twiddle-factor tables for an FFT of size N.
 *
 * Entry `k` holds cos and sin of the angle `-2πk / N`, for
 * `k = 0 .. (N >> 1) - 1`.
 *
 * @param N FFT size
 * @returns Tables `cos` and `sin`, each of length `N >> 1`
 */
export function precomputeTwiddles(N: number): { cos: Float64Array; sin: Float64Array } {
    const count = N >> 1;
    const table = {
        cos: new Float64Array(count),
        sin: new Float64Array(count),
    };
    for (let k = 0; k < count; k += 1) {
        const theta = (-2 * Math.PI * k) / N;
        table.cos[k] = Math.cos(theta);
        table.sin[k] = Math.sin(theta);
    }
    return table;
}

/**
 * In-place radix-2 Cooley-Tukey FFT (decimation in time).
 *
 * The first `n` entries of `re` and `im` are overwritten with the transform.
 * An `n` of 0 or 1 leaves the buffers untouched.
 *
 * @param re Real part (modified in-place), at least `n` entries
 * @param im Imaginary part (modified in-place), at least `n` entries
 * @param n FFT size (must be a power of 2)
 * @param tw Twiddle factors precomputed for size `n` (at least `n / 2` entries each)
 * @throws RangeError if `n` is not a non-negative power-of-2 integer, or if
 *         `re`, `im` or the twiddle tables are too short for `n`
 */
export function fft(re: Float64Array, im: Float64Array, n: number, tw: { cos: Float64Array; sin: Float64Array }): void {
    // Validate up front: a non-power-of-2 size or undersized buffers would
    // otherwise read undefined entries and silently fill the output with NaN.
    if (!Number.isInteger(n) || n < 0 || (n & (n - 1)) !== 0) {
        throw new RangeError(`fft: n must be a power of 2, got ${n}`);
    }
    const halfN = n >> 1;
    if (re.length < n || im.length < n) {
        throw new RangeError(`fft: re and im must each hold at least ${n} values`);
    }
    if (tw.cos.length < halfN || tw.sin.length < halfN) {
        throw new RangeError(`fft: twiddle tables must each hold at least ${halfN} values`);
    }

    // Bit-reversal permutation: j is kept equal to the bit-reversed value of i.
    for (let i = 1, j = 0; i < n; i++) {
        // Increment j in bit-reversed order: clear leading set bits, then
        // set the first clear bit.
        let bit = halfN;
        while (j & bit) { j ^= bit; bit >>= 1; }
        j ^= bit;
        // Swap each pair only once.
        if (i < j) {
            let tmp = re[i]; re[i] = re[j]; re[j] = tmp;
            tmp = im[i]; im[i] = im[j]; im[j] = tmp;
        }
    }

    // Butterfly passes: merge sub-transforms of length size / 2 into length size.
    for (let size = 2; size <= n; size <<= 1) {
        const half = size >> 1;
        const step = n / size; // stride through the size-n twiddle table
        for (let start = 0; start < n; start += size) {
            for (let j = 0; j < half; j++) {
                const top = start + j;
                const bottom = top + half;
                const wRe = tw.cos[j * step];
                const wIm = tw.sin[j * step];
                // Complex product: twiddle × bottom element.
                const tRe = re[bottom] * wRe - im[bottom] * wIm;
                const tIm = re[bottom] * wIm + im[bottom] * wRe;
                re[bottom] = re[top] - tRe;
                im[bottom] = im[top] - tIm;
                re[top] += tRe;
                im[top] += tIm;
            }
        }
    }
}

/**
 * Run the pre-emphasis filter `y[i] = x[i] - coeff * x[i - 1]` over a chunk.
 *
 * The input is not modified; a new array of the same length is returned.
 *
 * @param chunk Raw audio chunk
 * @param lastSample Sample preceding `chunk[0]` (last sample of the previous chunk), default 0
 * @param coeff Pre-emphasis coefficient (default MEL_CONSTANTS.PREEMPH, 0.97)
 * @returns Pre-emphasized samples
 */
export function preemphasize(chunk: Float32Array, lastSample: number = 0, coeff: number = MEL_CONSTANTS.PREEMPH): Float32Array {
    const count = chunk.length;
    const filtered = new Float32Array(count);
    // Carry the previous input sample along, seeded from the prior chunk.
    let previous = lastSample;
    for (let i = 0; i < count; i += 1) {
        const current = chunk[i];
        filtered[i] = current - coeff * previous;
        previous = current;
    }
    return filtered;
}

/**
 * Compute the log-mel values of one frame of pre-emphasized audio.
 *
 * The frame covers N_FFT samples starting N_FFT / 2 samples before
 * `frameIdx * HOP_LENGTH`; positions outside the buffer are read as zero.
 * The frame is windowed, transformed with `fft`, reduced to a power
 * spectrum, projected through the mel filterbank, and passed through
 * `log(x + LOG_ZERO_GUARD)`.
 *
 * @param preemphAudio Full pre-emphasized audio buffer
 * @param frameIdx Frame index
 * @param hannWindow Pre-computed window, indexed 0 .. N_FFT - 1
 * @param twiddles Pre-computed FFT twiddle factors
 * @param melFilterbank Pre-computed mel filterbank, row-major with N_FREQ_BINS values per row
 * @param nMels Number of mel bins
 * @returns Raw (un-normalized) log-mel values for this frame, length `nMels`
 */
export function computeMelFrame(
    preemphAudio: Float32Array,
    frameIdx: number,
    hannWindow: Float64Array,
    twiddles: { cos: Float64Array; sin: Float64Array },
    melFilterbank: Float32Array,
    nMels: number,
): Float32Array {
    const fftSize = MEL_CONSTANTS.N_FFT;
    const binCount = MEL_CONSTANTS.N_FREQ_BINS;
    const logGuard = MEL_CONSTANTS.LOG_ZERO_GUARD;

    // The frame is centred on frameIdx * HOP_LENGTH, so it starts half an FFT earlier.
    const firstSample = frameIdx * MEL_CONSTANTS.HOP_LENGTH - (fftSize >> 1);
    const audioLen = preemphAudio.length;

    // Windowed frame as a complex signal; the imaginary part stays at its initial zeros.
    const real = new Float64Array(fftSize);
    const imag = new Float64Array(fftSize);
    for (let k = 0; k < fftSize; k += 1) {
        const src = firstSample + k;
        let sample = 0;
        if (src >= 0 && src < audioLen) {
            sample = preemphAudio[src];
        }
        real[k] = sample * hannWindow[k];
    }

    fft(real, imag, fftSize, twiddles);

    // One-sided power spectrum |X[k]|^2.
    const spectrum = new Float32Array(binCount);
    for (let bin = 0; bin < binCount; bin += 1) {
        const a = real[bin];
        const b = imag[bin];
        spectrum[bin] = a * a + b * b;
    }

    // Project onto each mel band, then take the guarded log.
    const logMel = new Float32Array(nMels);
    for (let band = 0; band < nMels; band += 1) {
        const rowStart = band * binCount;
        let energy = 0;
        for (let bin = 0; bin < binCount; bin += 1) {
            energy += spectrum[bin] * melFilterbank[rowStart + bin];
        }
        logMel[band] = Math.log(energy + logGuard);
    }
    return logMel;
}

/**
 * Standardize each mel band over time: subtract the band's mean and divide
 * by its sample standard deviation (Bessel-corrected, plus 1e-5).
 *
 * When `T` is 1 the scale factor is 0, so every output value is 0.
 * The input array is not modified.
 *
 * @param features Flat array [nMels × T], mel-major layout
 * @param nMels Number of mel bins
 * @param T Number of time frames
 * @returns Normalized features (new array, same length as `features`)
 */
export function normalizeMelFeatures(features: Float32Array, nMels: number, T: number): Float32Array {
    const normalized = new Float32Array(features.length);

    for (let band = 0; band < nMels; band += 1) {
        const rowStart = band * T;
        const rowEnd = rowStart + T;

        // Mean of this band over time.
        let total = 0;
        for (let i = rowStart; i < rowEnd; i += 1) {
            total += features[i];
        }
        const mean = total / T;

        // Sum of squared deviations from the mean.
        let squaredDev = 0;
        for (let i = rowStart; i < rowEnd; i += 1) {
            const dev = features[i] - mean;
            squaredDev += dev * dev;
        }

        // Reciprocal of (sample std + epsilon); zero when there is a single frame.
        let scale = 0;
        if (T > 1) {
            scale = 1.0 / (Math.sqrt(squaredDev / (T - 1)) + 1e-5);
        }

        for (let i = rowStart; i < rowEnd; i += 1) {
            normalized[i] = (features[i] - mean) * scale;
        }
    }

    return normalized;
}

/**
 * Map a sample offset to the index of the frame it falls in, i.e.
 * `floor(sampleOffset / HOP_LENGTH)`.
 *
 * @param sampleOffset Offset in samples
 * @returns Frame index
 */
export function sampleToFrame(sampleOffset: number): number {
    const { HOP_LENGTH: hop } = MEL_CONSTANTS;
    const framePosition = sampleOffset / hop;
    return Math.floor(framePosition);
}