// Spaces: ysdede / keet-streaming · Running · File size: 18,914 Bytes · b8cc2bf

/**
 * Unit tests for mel spectrogram computation functions.
 * 
 * These tests verify that the mel math functions produce correct results
 * and match the expected behavior of NeMo/parakeet.js mel processing.
 * 
 * Run: npm test
 */

import { describe, it, expect } from 'vitest';
import {
    MEL_CONSTANTS,
    hzToMel,
    melToHz,
    createMelFilterbank,
    createPaddedHannWindow,
    precomputeTwiddles,
    fft,
    preemphasize,
    computeMelFrame,
    normalizeMelFeatures,
    sampleToFrame,
} from './mel-math';

// ─── Constants ────────────────────────────────────────────────────────────

describe('MEL_CONSTANTS', () => {
    // Table-driven check: each constant must be strictly equal to the value
    // listed here. Entries are checked in the order they appear.
    it('should have correct NeMo-compatible values', () => {
        const expectedByName = [
            ['SAMPLE_RATE', 16000],
            ['N_FFT', 512],
            ['WIN_LENGTH', 400],
            ['HOP_LENGTH', 160],
            ['PREEMPH', 0.97],
            ['N_FREQ_BINS', 257],
            ['DEFAULT_N_MELS', 128],
        ];

        for (const [name, value] of expectedByName) {
            expect(MEL_CONSTANTS[name]).toBe(value);
        }
    });
});

// ─── Mel Scale ────────────────────────────────────────────────────────────

describe('hzToMel / melToHz', () => {
    // Probe frequencies (Hz) on both sides of the 1000 Hz breakpoint between
    // the linear and the logarithmic part of the scale.
    const PROBE_FREQS_HZ = [0, 100, 500, 1000, 2000, 4000, 8000];

    it('should return 0 for 0 Hz', () => {
        expect(hzToMel(0)).toBe(0);
    });

    it('should return mel in linear region for freq < 1000 Hz', () => {
        // In linear region: mel = freq / (200/3) = freq * 3/200
        const freq = 500;
        const expected = freq / (200 / 3);
        expect(hzToMel(freq)).toBeCloseTo(expected, 5);
    });

    it('should return mel in log region for freq >= 1000 Hz', () => {
        // At 1000 Hz, mel = 1000 / (200/3) = 15.0
        expect(hzToMel(1000)).toBeCloseTo(15.0, 5);
        // Above 1000 Hz, should be in log region
        expect(hzToMel(2000)).toBeGreaterThan(15.0);
        // Pin the log-region value itself, not just "greater than the
        // breakpoint": Slaney-style scale (the one librosa uses by default)
        // has mel = 15 + ln(f / 1000) / (ln(6.4) / 27) above 1000 Hz.
        // NOTE(review): assumes mel-math follows that scale, as the linear
        // constants above suggest — confirm against the implementation.
        const logStep = Math.log(6.4) / 27;
        const expectedAt2k = 15 + Math.log(2000 / 1000) / logStep;
        expect(hzToMel(2000)).toBeCloseTo(expectedAt2k, 4);
    });

    it('should be invertible (roundtrip)', () => {
        for (const freq of PROBE_FREQS_HZ) {
            const recovered = melToHz(hzToMel(freq));
            expect(recovered).toBeCloseTo(freq, 3);
        }
    });

    it('should be monotonically increasing', () => {
        // Wrap the call so Array.prototype.map's extra (index, array)
        // arguments are never forwarded to hzToMel.
        const mels = PROBE_FREQS_HZ.map((freq) => hzToMel(freq));
        for (let i = 1; i < mels.length; i++) {
            expect(mels[i]).toBeGreaterThan(mels[i - 1]);
        }
    });
});

// ─── Mel Filterbank ───────────────────────────────────────────────────────

describe('createMelFilterbank', () => {
    // The filterbank is read here as a flat buffer of nMels rows, each
    // MEL_CONSTANTS.N_FREQ_BINS wide (row m starts at m * N_FREQ_BINS).

    it('should create filterbank with correct dimensions', () => {
        const nMels = 128;
        const fb = createMelFilterbank(nMels);
        expect(fb).toBeInstanceOf(Float32Array);
        expect(fb.length).toBe(nMels * MEL_CONSTANTS.N_FREQ_BINS);
    });

    it('should have non-negative values', () => {
        const fb = createMelFilterbank(128);
        for (let i = 0; i < fb.length; i++) {
            expect(fb[i]).toBeGreaterThanOrEqual(0);
        }
    });

    it('should have non-zero values in each mel bin', () => {
        const nMels = 128;
        const fb = createMelFilterbank(nMels);
        for (let m = 0; m < nMels; m++) {
            const offset = m * MEL_CONSTANTS.N_FREQ_BINS;
            let sum = 0;
            for (let k = 0; k < MEL_CONSTANTS.N_FREQ_BINS; k++) {
                sum += fb[offset + k];
            }
            expect(sum).toBeGreaterThan(0);
        }
    });

    it('should create triangular filters (each row is a triangle)', () => {
        const nMels = 64;
        const fb = createMelFilterbank(nMels);
        // Check that each filter has a single peak region (no multiple peaks)
        for (let m = 0; m < nMels; m++) {
            const offset = m * MEL_CONSTANTS.N_FREQ_BINS;
            // Find first and last non-zero
            let firstNonZero = -1;
            let lastNonZero = -1;
            for (let k = 0; k < MEL_CONSTANTS.N_FREQ_BINS; k++) {
                if (fb[offset + k] > 0) {
                    if (firstNonZero === -1) firstNonZero = k;
                    lastNonZero = k;
                }
            }
            // Should have at least one non-zero bin
            expect(firstNonZero).toBeGreaterThanOrEqual(0);
            // All values between first and last should be > 0 (contiguous support)
            for (let k = firstNonZero; k <= lastNonZero; k++) {
                expect(fb[offset + k]).toBeGreaterThan(0);
            }
            // Single peak: once the weights have started to fall they must
            // never rise again. Plateaus are tolerated, so float rounding
            // cannot trip this check; a second peak will.
            let falling = false;
            for (let k = firstNonZero + 1; k <= lastNonZero; k++) {
                const prev = fb[offset + k - 1];
                const curr = fb[offset + k];
                if (curr < prev) {
                    falling = true;
                } else if (falling) {
                    expect(curr).toBeLessThanOrEqual(prev);
                }
            }
        }
    });

    it('should work for different nMels values', () => {
        for (const nMels of [40, 64, 80, 128]) {
            const fb = createMelFilterbank(nMels);
            expect(fb.length).toBe(nMels * MEL_CONSTANTS.N_FREQ_BINS);
        }
    });
});

// ─── Hann Window ──────────────────────────────────────────────────────────

describe('createPaddedHannWindow', () => {
    // Index of the first active (non-padding) sample: the WIN_LENGTH-long
    // window is expected to sit centred inside the N_FFT-long buffer.
    const activeStart = () => (MEL_CONSTANTS.N_FFT - MEL_CONSTANTS.WIN_LENGTH) >> 1;

    it('should return a Float64Array of length N_FFT', () => {
        const hann = createPaddedHannWindow();
        expect(hann).toBeInstanceOf(Float64Array);
        expect(hann.length).toBe(MEL_CONSTANTS.N_FFT);
    });

    it('should have zero padding at edges', () => {
        const hann = createPaddedHannWindow();
        const start = activeStart(); // 56 for N_FFT=512, WIN_LENGTH=400
        const end = start + MEL_CONSTANTS.WIN_LENGTH; // one past the active region

        // Everything before the active region is padding.
        let idx = 0;
        while (idx < start) {
            expect(hann[idx]).toBe(0);
            idx += 1;
        }
        // Everything after the active region is padding too.
        idx = end;
        while (idx < MEL_CONSTANTS.N_FFT) {
            expect(hann[idx]).toBe(0);
            idx += 1;
        }
    });

    it('should have symmetric Hann values in the active region', () => {
        const hann = createPaddedHannWindow();
        const start = activeStart();
        const stop = start + MEL_CONSTANTS.WIN_LENGTH;
        // Walk one cursor forward and its mirror image backward.
        for (let lo = start, hi = stop - 1; lo < stop; lo++, hi--) {
            expect(hann[lo]).toBeCloseTo(hann[hi], 10);
        }
    });

    it('should peak at center with value ~1.0', () => {
        const hann = createPaddedHannWindow();
        const halfWindow = Math.floor(MEL_CONSTANTS.WIN_LENGTH / 2);
        const centerIndex = activeStart() + halfWindow;
        // The middle sample of the active region should be close to 1.0.
        expect(hann[centerIndex]).toBeCloseTo(1.0, 2);
    });
});

// ─── FFT ──────────────────────────────────────────────────────────────────

describe('fft', () => {
    // These tests call fft(re, im, n, twiddles) and then read the spectrum
    // back from the same `re` / `im` arrays that were passed in.

    it('should handle a DC signal', () => {
        const n = 8;
        const tw = precomputeTwiddles(n);
        const re = new Float64Array(n).fill(1);
        const im = new Float64Array(n);
        fft(re, im, n, tw);
        // DC component (re[0]) should be n, with no imaginary part
        expect(re[0]).toBeCloseTo(n, 5);
        expect(im[0]).toBeCloseTo(0, 5);
        // All other components should be ~0
        for (let i = 1; i < n; i++) {
            expect(re[i]).toBeCloseTo(0, 5);
            expect(im[i]).toBeCloseTo(0, 5);
        }
    });

    it('should handle a single frequency signal', () => {
        const n = 16;
        const tw = precomputeTwiddles(n);
        // Create a sinusoid at bin k=1: cos(2πk/N * n) for n=0..N-1
        const re = new Float64Array(n);
        const im = new Float64Array(n);
        for (let i = 0; i < n; i++) {
            re[i] = Math.cos(2 * Math.PI * i / n);
        }
        fft(re, im, n, tw);
        // Should have energy at bin 1 and bin N-1 (conjugate symmetry)
        expect(Math.abs(re[1])).toBeCloseTo(n / 2, 3);
        expect(Math.abs(re[n - 1])).toBeCloseTo(n / 2, 3);
        // A pure cosine has no sine component, so those bins are purely real
        expect(Math.abs(im[1])).toBeLessThan(1e-6);
        expect(Math.abs(im[n - 1])).toBeLessThan(1e-6);
        // A full cosine period averages to zero, so the DC bin must be empty
        expect(Math.abs(re[0])).toBeLessThan(1e-6);
        expect(Math.abs(im[0])).toBeLessThan(1e-6);
        // Other bins should be near zero
        for (let i = 2; i < n - 1; i++) {
            expect(Math.abs(re[i])).toBeLessThan(1e-6);
            expect(Math.abs(im[i])).toBeLessThan(1e-6);
        }
    });

    it('should handle 512-point FFT (actual size used)', () => {
        const n = 512;
        const tw = precomputeTwiddles(n);
        // All zeros
        const re = new Float64Array(n);
        const im = new Float64Array(n);
        fft(re, im, n, tw);
        // All outputs should be zero
        for (let i = 0; i < n; i++) {
            expect(re[i]).toBeCloseTo(0, 10);
            expect(im[i]).toBeCloseTo(0, 10);
        }
    });

    it('should transform a unit impulse into a flat spectrum (512-point)', () => {
        // The all-zero input above is satisfied by any linear transform; an
        // impulse at sample 0 exercises every butterfly stage at the real
        // FFT size and must yield X[k] = 1 + 0i for every bin.
        const n = 512;
        const tw = precomputeTwiddles(n);
        const re = new Float64Array(n);
        const im = new Float64Array(n);
        re[0] = 1;
        fft(re, im, n, tw);
        for (let i = 0; i < n; i++) {
            expect(re[i]).toBeCloseTo(1, 10);
            expect(im[i]).toBeCloseTo(0, 10);
        }
    });

    it('should satisfy Parseval\'s theorem (energy conservation)', () => {
        const n = 64;
        const tw = precomputeTwiddles(n);
        // Random-ish (but deterministic) signal
        const re = new Float64Array(n);
        const im = new Float64Array(n);
        for (let i = 0; i < n; i++) {
            re[i] = Math.sin(i * 0.37) + Math.cos(i * 0.83);
        }
        // Time domain energy (must be taken before the arrays are overwritten)
        let timeEnergy = 0;
        for (let i = 0; i < n; i++) {
            timeEnergy += re[i] * re[i] + im[i] * im[i];
        }

        fft(re, im, n, tw);

        // Frequency domain energy
        let freqEnergy = 0;
        for (let i = 0; i < n; i++) {
            freqEnergy += re[i] * re[i] + im[i] * im[i];
        }
        // Parseval: sum|x|^2 = (1/N) * sum|X|^2
        expect(freqEnergy / n).toBeCloseTo(timeEnergy, 5);
    });
});

// ─── Twiddle Factors ──────────────────────────────────────────────────────

describe('precomputeTwiddles', () => {
    // The returned object is read through its `cos` and `sin` tables.

    it('should produce cos and sin arrays of half the FFT size', () => {
        const { cos, sin } = precomputeTwiddles(512);
        const halfSize = 256;
        expect(cos.length).toBe(halfSize);
        expect(sin.length).toBe(halfSize);
    });

    it('should start with cos[0]=1, sin[0]=0', () => {
        const { cos, sin } = precomputeTwiddles(512);
        const [firstCos] = [cos[0]];
        const [firstSin] = [sin[0]];
        expect(firstCos).toBeCloseTo(1.0, 10);
        expect(firstSin).toBeCloseTo(0.0, 10);
    });
});

// ─── Pre-emphasis ─────────────────────────────────────────────────────────

describe('preemphasize', () => {
    // Expected values below follow out[i] = x[i] - coeff * x[i - 1], where the
    // second argument stands in for x[-1] and the third is coeff.

    it('should apply pre-emphasis filter correctly', () => {
        const chunk = new Float32Array([1.0, 2.0, 3.0, 4.0]);
        const result = preemphasize(chunk, 0, 0.97);
        // out[0] = 1.0 - 0.97 * 0 = 1.0
        expect(result[0]).toBeCloseTo(1.0, 5);
        // out[1] = 2.0 - 0.97 * 1.0 = 1.03
        expect(result[1]).toBeCloseTo(1.03, 5);
        // out[2] = 3.0 - 0.97 * 2.0 = 1.06
        expect(result[2]).toBeCloseTo(1.06, 5);
        // out[3] = 4.0 - 0.97 * 3.0 = 1.09
        expect(result[3]).toBeCloseTo(1.09, 5);
    });

    it('should use lastSample for continuity across chunks', () => {
        const chunk = new Float32Array([5.0, 6.0]);
        const result = preemphasize(chunk, 4.0, 0.97);
        // out[0] = 5.0 - 0.97 * 4.0 = 1.12
        expect(result[0]).toBeCloseTo(1.12, 5);
        // out[1] = 6.0 - 0.97 * 5.0 = 1.15
        expect(result[1]).toBeCloseTo(1.15, 5);
    });

    // The previous title claimed "zeros", but a constant signal is only
    // attenuated to (1 - coeff), which is what the assertion checks.
    it('should return (1 - coeff) for a constant unit signal', () => {
        const chunk = new Float32Array([1.0, 1.0, 1.0, 1.0]);
        const result = preemphasize(chunk, 1.0, 0.97);
        // All should be 1 - 0.97 = 0.03
        for (let i = 0; i < result.length; i++) {
            expect(result[i]).toBeCloseTo(0.03, 5);
        }
    });
});

// ─── Mel Frame Computation ────────────────────────────────────────────────

describe('computeMelFrame', () => {
    /**
     * Builds the inputs every test in this suite passes to computeMelFrame.
     * A fresh set is created per call so tests do not share state.
     * The window is named `hannWindow` to avoid shadowing the browser's
     * global `window`.
     */
    const makeFixture = (nMels = 128) => {
        const hannWindow = createPaddedHannWindow();
        const tw = precomputeTwiddles(MEL_CONSTANTS.N_FFT);
        const fb = createMelFilterbank(nMels);
        return { nMels, hannWindow, tw, fb };
    };

    it('should produce correct number of mel bins', () => {
        const { nMels, hannWindow, tw, fb } = makeFixture();

        // 1 second of silence
        const audio = new Float32Array(16000);
        const frame = computeMelFrame(audio, 0, hannWindow, tw, fb, nMels);

        expect(frame).toBeInstanceOf(Float32Array);
        expect(frame.length).toBe(nMels);
    });

    it('should produce finite values for silence', () => {
        const { nMels, hannWindow, tw, fb } = makeFixture();

        const audio = new Float32Array(16000);
        const frame = computeMelFrame(audio, 10, hannWindow, tw, fb, nMels);

        for (let i = 0; i < nMels; i++) {
            // Number.isFinite does not coerce its argument, so a missing
            // (undefined) or non-numeric entry fails instead of being
            // silently converted.
            expect(Number.isFinite(frame[i])).toBe(true);
        }
    });

    it('should produce larger values for louder signal', () => {
        const { nMels, hannWindow, tw, fb } = makeFixture();

        // Silence
        const silence = new Float32Array(16000);
        const silenceFrame = computeMelFrame(silence, 10, hannWindow, tw, fb, nMels);

        // Loud sine wave (440 Hz, full scale)
        const loud = new Float32Array(16000);
        for (let i = 0; i < loud.length; i++) {
            loud[i] = Math.sin(2 * Math.PI * 440 * i / 16000);
        }
        const preemph = preemphasize(loud);
        const loudFrame = computeMelFrame(preemph, 10, hannWindow, tw, fb, nMels);

        // At least some mel bins should be larger for the loud signal
        let louderCount = 0;
        for (let i = 0; i < nMels; i++) {
            if (loudFrame[i] > silenceFrame[i]) louderCount++;
        }
        expect(louderCount).toBeGreaterThan(0);
    });
});

// ─── Normalization ────────────────────────────────────────────────────────

describe('normalizeMelFeatures', () => {
    // Buffers in this suite are indexed as data[row * T + t]: one row of T
    // consecutive frames per mel bin.

    /** Arithmetic mean of one row of a flat [rows x T] buffer. */
    const rowMean = (data, row, T) => {
        let total = 0;
        for (let t = 0; t < T; t++) {
            total += data[row * T + t];
        }
        return total / T;
    };

    it('should produce zero-mean per feature', () => {
        const nMels = 4;
        const T = 10;
        // Row m holds the ramp m*10 + 0, m*10 + 1, ...
        const features = Float32Array.from(
            { length: nMels * T },
            (_, idx) => Math.floor(idx / T) * 10 + (idx % T),
        );

        const normalized = normalizeMelFeatures(features, nMels, T);

        // Each mel bin should have ~zero mean
        for (let m = 0; m < nMels; m++) {
            expect(rowMean(normalized, m, T)).toBeCloseTo(0, 4);
        }
    });

    it('should produce unit variance per feature', () => {
        const nMels = 4;
        const T = 100;
        // Row m holds a sine wave with a per-row phase offset of m.
        const features = Float32Array.from(
            { length: nMels * T },
            (_, idx) => Math.sin((idx % T) * 0.1 + Math.floor(idx / T)),
        );

        const normalized = normalizeMelFeatures(features, nMels, T);

        // Each mel bin should have ~unit Bessel-corrected std
        for (let m = 0; m < nMels; m++) {
            const mean = rowMean(normalized, m, T);

            let squaredDeviations = 0;
            for (let t = 0; t < T; t++) {
                const delta = normalized[m * T + t] - mean;
                squaredDeviations += delta * delta;
            }
            // Divide by T - 1 (Bessel's correction), not T.
            const std = Math.sqrt(squaredDeviations / (T - 1));
            expect(std).toBeCloseTo(1.0, 1);
        }
    });

    it('should handle single frame (T=1) gracefully', () => {
        const nMels = 4;
        const T = 1;
        const features = Float32Array.of(1, 2, 3, 4);

        const normalized = normalizeMelFeatures(features, nMels, T);
        // With T=1, invStd=0, so all should be 0
        for (let idx = 0; idx < normalized.length; idx++) {
            expect(normalized[idx]).toBe(0);
        }
    });

    it('should not modify the original array', () => {
        const features = Float32Array.of(1, 2, 3, 4, 5, 6, 7, 8);
        const snapshot = Float32Array.from(features);
        normalizeMelFeatures(features, 2, 4);
        expect(features).toEqual(snapshot);
    });
});

// ─── sampleToFrame ────────────────────────────────────────────────────────

describe('sampleToFrame', () => {
    // Expected results correspond to one frame per HOP_LENGTH samples,
    // rounded down.

    it('should convert 0 samples to frame 0', () => {
        const frame = sampleToFrame(0);
        expect(frame).toBe(0);
    });

    it('should convert HOP_LENGTH samples to frame 1', () => {
        const { HOP_LENGTH: hop } = MEL_CONSTANTS;
        expect(sampleToFrame(hop)).toBe(1);
    });

    it('should convert 1 second (16000 samples) to 100 frames', () => {
        const oneSecondOfSamples = 16000;
        expect(sampleToFrame(oneSecondOfSamples)).toBe(100);
    });

    it('should floor partial frames', () => {
        // One sample short of a hop still belongs to frame 0 ...
        const justBelowHop = MEL_CONSTANTS.HOP_LENGTH - 1;
        expect(sampleToFrame(justBelowHop)).toBe(0);
        // ... and one sample past a hop has not yet reached frame 2.
        const justAboveHop = MEL_CONSTANTS.HOP_LENGTH + 1;
        expect(sampleToFrame(justAboveHop)).toBe(1);
    });
});

// ─── End-to-End Mel Pipeline ──────────────────────────────────────────────

describe('End-to-End Mel Pipeline', () => {
    it('should produce deterministic results for the same input', () => {
        const nMels = 128;
        // Named `hannWindow` to avoid shadowing the browser's global `window`.
        const hannWindow = createPaddedHannWindow();
        const tw = precomputeTwiddles(MEL_CONSTANTS.N_FFT);
        const fb = createMelFilterbank(nMels);

        // Create a repeatable signal: 440 Hz sine at half scale
        const audio = new Float32Array(4800); // 300ms
        for (let i = 0; i < audio.length; i++) {
            audio[i] = Math.sin(2 * Math.PI * 440 * i / 16000) * 0.5;
        }
        const preemph = preemphasize(audio);

        // Snapshot the first result before computing the second. If
        // computeMelFrame ever returned a reused internal buffer, comparing
        // the two references directly would compare the buffer with itself
        // and the test would pass vacuously.
        const frame1 = Array.from(computeMelFrame(preemph, 5, hannWindow, tw, fb, nMels));
        const frame2 = computeMelFrame(preemph, 5, hannWindow, tw, fb, nMels);

        // Guard against an empty/short result, where every comparison below
        // would be undefined-vs-undefined and trivially succeed.
        expect(frame1.length).toBe(nMels);
        expect(frame2.length).toBe(nMels);

        for (let i = 0; i < nMels; i++) {
            expect(frame1[i]).toBe(frame2[i]);
        }
    });

    it('should produce correct number of frames for given audio length', () => {
        // 1 second = 16000 samples → 100 frames
        expect(sampleToFrame(16000)).toBe(100);
        // 5 seconds = 80000 samples → 500 frames
        expect(sampleToFrame(80000)).toBe(500);
        // 7 seconds = 112000 samples → 700 frames
        expect(sampleToFrame(112000)).toBe(700);
    });
});