// keet-streaming / src /lib /audio /mel-math.test.ts
// ysdede's picture
// feat(space): migrate Hugging Face Space to keet SolidJS app
// b8cc2bf
/**
* Unit tests for mel spectrogram computation functions.
*
* These tests verify that the mel math functions produce correct results
* and match the expected behavior of NeMo/parakeet.js mel processing.
*
* Run: npm test
*/
import { describe, it, expect } from 'vitest';
import {
MEL_CONSTANTS,
hzToMel,
melToHz,
createMelFilterbank,
createPaddedHannWindow,
precomputeTwiddles,
fft,
preemphasize,
computeMelFrame,
normalizeMelFeatures,
sampleToFrame,
} from './mel-math';
// ─── Constants ────────────────────────────────────────────────────────────
describe('MEL_CONSTANTS', () => {
  // Pins every exported constant to the literal value the rest of the suite relies on.
  it('should have correct NeMo-compatible values', () => {
    const {
      SAMPLE_RATE,
      N_FFT,
      WIN_LENGTH,
      HOP_LENGTH,
      PREEMPH,
      N_FREQ_BINS,
      DEFAULT_N_MELS,
    } = MEL_CONSTANTS;

    expect(SAMPLE_RATE).toBe(16000);
    expect(N_FFT).toBe(512);
    expect(WIN_LENGTH).toBe(400);
    expect(HOP_LENGTH).toBe(160);
    expect(PREEMPH).toBe(0.97);
    expect(N_FREQ_BINS).toBe(257);
    expect(DEFAULT_N_MELS).toBe(128);
  });
});
// ─── Mel Scale ────────────────────────────────────────────────────────────
describe('hzToMel / melToHz', () => {
  // Probe frequencies (Hz) reused by the roundtrip and monotonicity checks.
  const probeFreqs = [0, 100, 500, 1000, 2000, 4000, 8000];

  it('should return 0 for 0 Hz', () => {
    const melAtZero = hzToMel(0);
    expect(melAtZero).toBe(0);
  });

  it('should return mel in linear region for freq < 1000 Hz', () => {
    // Below 1000 Hz the mapping is expected to be linear: mel = hz / (200 / 3).
    const hz = 500;
    const linearMel = hz / (200 / 3);
    expect(hzToMel(hz)).toBeCloseTo(linearMel, 5);
  });

  it('should return mel in log region for freq >= 1000 Hz', () => {
    // 1000 Hz is the break point: 1000 / (200 / 3) = 15.0 mel.
    const melAtBreak = hzToMel(1000);
    expect(melAtBreak).toBeCloseTo(15.0, 5);

    // Anything above the break point must map above 15 mel.
    const melAboveBreak = hzToMel(2000);
    expect(melAboveBreak).toBeGreaterThan(15.0);
  });

  it('should be invertible (roundtrip)', () => {
    probeFreqs.forEach((hz) => {
      const backToHz = melToHz(hzToMel(hz));
      expect(backToHz).toBeCloseTo(hz, 3);
    });
  });

  it('should be monotonically increasing', () => {
    const mels = probeFreqs.map(hzToMel);
    // Walk adjacent pairs; each value must exceed its predecessor.
    let idx = 1;
    while (idx < mels.length) {
      expect(mels[idx]).toBeGreaterThan(mels[idx - 1]);
      idx += 1;
    }
  });
});
// ─── Mel Filterbank ───────────────────────────────────────────────────────
/**
 * Tests for createMelFilterbank.
 *
 * The assertions treat the returned buffer as row-major [nMels, N_FREQ_BINS]:
 * filter `m` occupies indices m * N_FREQ_BINS .. (m + 1) * N_FREQ_BINS - 1.
 */
describe('createMelFilterbank', () => {
  it('should create filterbank with correct dimensions', () => {
    const nMels = 128;
    const fb = createMelFilterbank(nMels);
    expect(fb).toBeInstanceOf(Float32Array);
    expect(fb.length).toBe(nMels * MEL_CONSTANTS.N_FREQ_BINS);
  });

  it('should have non-negative values', () => {
    const fb = createMelFilterbank(128);
    for (let i = 0; i < fb.length; i++) {
      expect(fb[i]).toBeGreaterThanOrEqual(0);
    }
  });

  it('should have non-zero values in each mel bin', () => {
    const nMels = 128;
    const nBins = MEL_CONSTANTS.N_FREQ_BINS;
    const fb = createMelFilterbank(nMels);
    for (let m = 0; m < nMels; m++) {
      const offset = m * nBins;
      let sum = 0;
      for (let k = 0; k < nBins; k++) {
        sum += fb[offset + k];
      }
      // Weights are non-negative, so a positive sum means at least one bin is active.
      expect(sum).toBeGreaterThan(0);
    }
  });

  it('should create triangular filters (each row is a triangle)', () => {
    const nMels = 64;
    const nBins = MEL_CONSTANTS.N_FREQ_BINS;
    const fb = createMelFilterbank(nMels);

    for (let m = 0; m < nMels; m++) {
      const offset = m * nBins;

      // Locate the support of this filter: first and last non-zero bins.
      let firstNonZero = -1;
      let lastNonZero = -1;
      for (let k = 0; k < nBins; k++) {
        if (fb[offset + k] > 0) {
          if (firstNonZero === -1) firstNonZero = k;
          lastNonZero = k;
        }
      }

      // Should have at least one non-zero bin.
      expect(firstNonZero).toBeGreaterThanOrEqual(0);

      // Contiguous support: no zero gaps between the first and last active bin.
      for (let k = firstNonZero; k <= lastNonZero; k++) {
        expect(fb[offset + k]).toBeGreaterThan(0);
      }

      // Single peak: once the weights start falling they must never rise again.
      // Comparisons are non-strict so a flat top or equal neighbours are allowed.
      let falling = false;
      let risesAfterFall = 0;
      for (let k = firstNonZero + 1; k <= lastNonZero; k++) {
        const prev = fb[offset + k - 1];
        const curr = fb[offset + k];
        if (curr < prev) {
          falling = true;
        } else if (falling && curr > prev) {
          risesAfterFall++;
        }
      }
      expect(risesAfterFall).toBe(0);
    }
  });

  it('should work for different nMels values', () => {
    for (const nMels of [40, 64, 80, 128]) {
      const fb = createMelFilterbank(nMels);
      expect(fb.length).toBe(nMels * MEL_CONSTANTS.N_FREQ_BINS);
    }
  });
});
// ─── Hann Window ──────────────────────────────────────────────────────────
describe('createPaddedHannWindow', () => {
  // Index where the active (non-padded) region begins: (512 - 400) >> 1 = 56
  // for the constant values pinned elsewhere in this suite.
  const activeStart = (): number =>
    (MEL_CONSTANTS.N_FFT - MEL_CONSTANTS.WIN_LENGTH) >> 1;

  it('should return a Float64Array of length N_FFT', () => {
    const hann = createPaddedHannWindow();
    expect(hann).toBeInstanceOf(Float64Array);
    expect(hann.length).toBe(MEL_CONSTANTS.N_FFT);
  });

  it('should have zero padding at edges', () => {
    const hann = createPaddedHannWindow();
    const start = activeStart();

    // Everything before the active region must be exactly zero.
    let idx = 0;
    while (idx < start) {
      expect(hann[idx]).toBe(0);
      idx += 1;
    }

    // Everything from the end of the active region to N_FFT must be exactly zero.
    const rightPadStart = start + MEL_CONSTANTS.WIN_LENGTH;
    idx = rightPadStart;
    while (idx < MEL_CONSTANTS.N_FFT) {
      expect(hann[idx]).toBe(0);
      idx += 1;
    }
  });

  it('should have symmetric Hann values in the active region', () => {
    const hann = createPaddedHannWindow();
    const start = activeStart();
    const winLen = MEL_CONSTANTS.WIN_LENGTH;

    // Compare each active sample against its mirror image about the window centre.
    for (let left = 0; left < winLen; left++) {
      const right = winLen - 1 - left;
      expect(hann[start + left]).toBeCloseTo(hann[start + right], 10);
    }
  });

  it('should peak at center with value ~1.0', () => {
    const hann = createPaddedHannWindow();
    const midIndex = activeStart() + Math.floor(MEL_CONSTANTS.WIN_LENGTH / 2);
    // The middle sample of the active region should be close to 1.0.
    expect(hann[midIndex]).toBeCloseTo(1.0, 2);
  });
});
// ─── FFT ──────────────────────────────────────────────────────────────────
/**
 * Tests for fft.
 *
 * Each test fills `re` / `im` with a time-domain signal, calls
 * fft(re, im, n, twiddles), and then reads the spectrum back from those same
 * two arrays.
 */
describe('fft', () => {
  it('should handle a DC signal', () => {
    const n = 8;
    const tw = precomputeTwiddles(n);
    const re = new Float64Array([1, 1, 1, 1, 1, 1, 1, 1]);
    const im = new Float64Array(n);
    fft(re, im, n, tw);

    // A constant signal of ones sums to n in bin 0, with no imaginary part.
    expect(re[0]).toBeCloseTo(n, 5);
    expect(im[0]).toBeCloseTo(0, 5);

    // All other components should be ~0.
    for (let i = 1; i < n; i++) {
      expect(re[i]).toBeCloseTo(0, 5);
      expect(im[i]).toBeCloseTo(0, 5);
    }
  });

  it('should handle a single frequency signal', () => {
    const n = 16;
    const tw = precomputeTwiddles(n);

    // Real cosine at bin k = 1: x[i] = cos(2πk/N · i) for i = 0..N-1.
    const re = new Float64Array(n);
    const im = new Float64Array(n);
    for (let i = 0; i < n; i++) {
      re[i] = Math.cos(2 * Math.PI * i / n);
    }
    fft(re, im, n, tw);

    // A real cosine splits its energy evenly between bin 1 and bin N-1
    // (conjugate symmetry). Both peaks are +N/2 and purely real.
    expect(re[1]).toBeCloseTo(n / 2, 3);
    expect(im[1]).toBeCloseTo(0, 5);
    expect(re[n - 1]).toBeCloseTo(n / 2, 3);
    expect(im[n - 1]).toBeCloseTo(0, 5);

    // A full period of a cosine has zero mean, so the DC bin is empty.
    expect(Math.abs(re[0])).toBeLessThan(1e-6);
    expect(Math.abs(im[0])).toBeLessThan(1e-6);

    // Other bins should be near zero.
    for (let i = 2; i < n - 1; i++) {
      expect(Math.abs(re[i])).toBeLessThan(1e-6);
      expect(Math.abs(im[i])).toBeLessThan(1e-6);
    }
  });

  it('should handle 512-point FFT (actual size used)', () => {
    const n = 512;
    const tw = precomputeTwiddles(n);

    // All-zero input.
    const re = new Float64Array(n);
    const im = new Float64Array(n);
    fft(re, im, n, tw);

    // All outputs should be zero.
    for (let i = 0; i < n; i++) {
      expect(re[i]).toBeCloseTo(0, 10);
      expect(im[i]).toBeCloseTo(0, 10);
    }
  });

  it('should satisfy Parseval\'s theorem (energy conservation)', () => {
    const n = 64;
    const tw = precomputeTwiddles(n);

    // Deterministic but irregular-looking real signal.
    const re = new Float64Array(n);
    const im = new Float64Array(n);
    for (let i = 0; i < n; i++) {
      re[i] = Math.sin(i * 0.37) + Math.cos(i * 0.83);
    }

    // Time-domain energy must be measured before the call, because the
    // spectrum is read from the same arrays afterwards.
    let timeEnergy = 0;
    for (let i = 0; i < n; i++) {
      timeEnergy += re[i] * re[i] + im[i] * im[i];
    }

    fft(re, im, n, tw);

    // Frequency-domain energy.
    let freqEnergy = 0;
    for (let i = 0; i < n; i++) {
      freqEnergy += re[i] * re[i] + im[i] * im[i];
    }

    // Parseval: sum|x|^2 = (1/N) * sum|X|^2
    expect(freqEnergy / n).toBeCloseTo(timeEnergy, 5);
  });
});
// ─── Twiddle Factors ──────────────────────────────────────────────────────
describe('precomputeTwiddles', () => {
  it('should produce cos and sin arrays of half the FFT size', () => {
    // A 512-point table is expected to hold 256 cosine and 256 sine entries.
    const { cos, sin } = precomputeTwiddles(512);
    expect(cos.length).toBe(256);
    expect(sin.length).toBe(256);
  });

  it('should start with cos[0]=1, sin[0]=0', () => {
    const table = precomputeTwiddles(512);
    const firstCos = table.cos[0];
    const firstSin = table.sin[0];
    expect(firstCos).toBeCloseTo(1.0, 10);
    expect(firstSin).toBeCloseTo(0.0, 10);
  });
});
// ─── Pre-emphasis ─────────────────────────────────────────────────────────
/**
 * Tests for preemphasize(chunk, lastSample, coeff).
 *
 * The expected values below follow out[i] = x[i] - coeff * x[i - 1], where
 * `lastSample` stands in for x[-1] on the first sample of the chunk.
 */
describe('preemphasize', () => {
  it('should apply pre-emphasis filter correctly', () => {
    const chunk = new Float32Array([1.0, 2.0, 3.0, 4.0]);
    const result = preemphasize(chunk, 0, 0.97);
    // out[0] = 1.0 - 0.97 * 0 = 1.0
    expect(result[0]).toBeCloseTo(1.0, 5);
    // out[1] = 2.0 - 0.97 * 1.0 = 1.03
    expect(result[1]).toBeCloseTo(1.03, 5);
    // out[2] = 3.0 - 0.97 * 2.0 = 1.06
    expect(result[2]).toBeCloseTo(1.06, 5);
    // out[3] = 4.0 - 0.97 * 3.0 = 1.09
    expect(result[3]).toBeCloseTo(1.09, 5);
  });

  it('should use lastSample for continuity across chunks', () => {
    const chunk = new Float32Array([5.0, 6.0]);
    const result = preemphasize(chunk, 4.0, 0.97);
    // out[0] = 5.0 - 0.97 * 4.0 = 1.12
    expect(result[0]).toBeCloseTo(1.12, 5);
    // out[1] = 6.0 - 0.97 * 5.0 = 1.15
    expect(result[1]).toBeCloseTo(1.15, 5);
  });

  // The previous title said "zeros", but a constant signal is attenuated to
  // (1 - coeff) times its level, not removed: 1 - 0.97 * 1 = 0.03.
  it('should return (1 - coeff) for constant unit signal', () => {
    const chunk = new Float32Array([1.0, 1.0, 1.0, 1.0]);
    // lastSample = 1.0 so the first output sample sees the same constant history.
    const result = preemphasize(chunk, 1.0, 0.97);
    for (let i = 0; i < result.length; i++) {
      expect(result[i]).toBeCloseTo(0.03, 5);
    }
  });
});
// ─── Mel Frame Computation ────────────────────────────────────────────────
describe('computeMelFrame', () => {
  // Builds the window, twiddle table and filterbank every test needs,
  // in the same order each test previously built them inline.
  const buildPipeline = (nMels: number) => {
    const hannWindow = createPaddedHannWindow();
    const twiddles = precomputeTwiddles(MEL_CONSTANTS.N_FFT);
    const filterbank = createMelFilterbank(nMels);
    return { hannWindow, twiddles, filterbank };
  };

  it('should produce correct number of mel bins', () => {
    const nMels = 128;
    const { hannWindow, twiddles, filterbank } = buildPipeline(nMels);

    // 16000 zero samples (1 second at the 16 kHz rate used throughout this suite).
    const silence = new Float32Array(16000);
    const melFrame = computeMelFrame(silence, 0, hannWindow, twiddles, filterbank, nMels);

    expect(melFrame).toBeInstanceOf(Float32Array);
    expect(melFrame.length).toBe(nMels);
  });

  it('should produce finite values for silence', () => {
    const nMels = 128;
    const { hannWindow, twiddles, filterbank } = buildPipeline(nMels);
    const silence = new Float32Array(16000);
    const melFrame = computeMelFrame(silence, 10, hannWindow, twiddles, filterbank, nMels);

    // No bin may be NaN or +/-Infinity, even for all-zero input.
    for (let bin = 0; bin < nMels; bin++) {
      expect(Number.isFinite(melFrame[bin])).toBe(true);
    }
  });

  it('should produce larger values for louder signal', () => {
    const nMels = 128;
    const { hannWindow, twiddles, filterbank } = buildPipeline(nMels);

    // Reference: frame 10 of an all-zero buffer.
    const quiet = new Float32Array(16000);
    const quietFrame = computeMelFrame(quiet, 10, hannWindow, twiddles, filterbank, nMels);

    // Full-scale 440 Hz sine, pre-emphasized with the function's default arguments.
    const tone = new Float32Array(16000);
    for (let s = 0; s < 16000; s++) {
      tone[s] = Math.sin(2 * Math.PI * 440 * s / 16000);
    }
    const emphasized = preemphasize(tone);
    const toneFrame = computeMelFrame(emphasized, 10, hannWindow, twiddles, filterbank, nMels);

    // Count the bins where the tone exceeds silence; at least one must.
    let louderBins = 0;
    for (let bin = 0; bin < nMels; bin++) {
      if (toneFrame[bin] > quietFrame[bin]) {
        louderBins += 1;
      }
    }
    expect(louderBins).toBeGreaterThan(0);
  });
});
// ─── Normalization ────────────────────────────────────────────────────────
describe('normalizeMelFeatures', () => {
  // Mean of row `row` in a buffer indexed as data[row * T + t].
  const rowMean = (data: ArrayLike<number>, row: number, T: number): number => {
    let total = 0;
    for (let t = 0; t < T; t++) {
      total += data[row * T + t];
    }
    return total / T;
  };

  it('should produce zero-mean per feature', () => {
    const nMels = 4;
    const T = 10;
    const features = new Float32Array(nMels * T);
    // Row m holds the ramp m*10, m*10 + 1, ..., m*10 + (T - 1).
    for (let idx = 0; idx < features.length; idx++) {
      const m = Math.floor(idx / T);
      const t = idx % T;
      features[idx] = m * 10 + t;
    }

    const normalized = normalizeMelFeatures(features, nMels, T);

    // After normalization every row should average to ~0.
    for (let m = 0; m < nMels; m++) {
      expect(rowMean(normalized, m, T)).toBeCloseTo(0, 4);
    }
  });

  it('should produce unit variance per feature', () => {
    const nMels = 4;
    const T = 100;
    const features = new Float32Array(nMels * T);
    // Each row is a phase-shifted sine so the rows differ from one another.
    for (let idx = 0; idx < features.length; idx++) {
      const m = Math.floor(idx / T);
      const t = idx % T;
      features[idx] = Math.sin(t * 0.1 + m);
    }

    const normalized = normalizeMelFeatures(features, nMels, T);

    // Per-row standard deviation, using the Bessel-corrected (T - 1) divisor.
    for (let m = 0; m < nMels; m++) {
      const mean = rowMean(normalized, m, T);
      let squaredDevs = 0;
      for (let t = 0; t < T; t++) {
        const dev = normalized[m * T + t] - mean;
        squaredDevs += dev * dev;
      }
      const std = Math.sqrt(squaredDevs / (T - 1));
      expect(std).toBeCloseTo(1.0, 1);
    }
  });

  it('should handle single frame (T=1) gracefully', () => {
    const nMels = 4;
    const T = 1;
    const features = new Float32Array([1, 2, 3, 4]);

    const normalized = normalizeMelFeatures(features, nMels, T);

    // With only one frame per row, every output value is expected to be exactly 0.
    for (let idx = 0; idx < normalized.length; idx++) {
      expect(normalized[idx]).toBe(0);
    }
  });

  it('should not modify the original array', () => {
    const features = new Float32Array([1, 2, 3, 4, 5, 6, 7, 8]);
    // Snapshot the input before the call, then compare afterwards.
    const snapshot = new Float32Array(features);
    normalizeMelFeatures(features, 2, 4);
    expect(features).toEqual(snapshot);
  });
});
// ─── sampleToFrame ────────────────────────────────────────────────────────
describe('sampleToFrame', () => {
  it('should convert 0 samples to frame 0', () => {
    const frame = sampleToFrame(0);
    expect(frame).toBe(0);
  });

  it('should convert HOP_LENGTH samples to frame 1', () => {
    const { HOP_LENGTH } = MEL_CONSTANTS;
    expect(sampleToFrame(HOP_LENGTH)).toBe(1);
  });

  it('should convert 1 second (16000 samples) to 100 frames', () => {
    const oneSecondOfSamples = 16000;
    expect(sampleToFrame(oneSecondOfSamples)).toBe(100);
  });

  it('should floor partial frames', () => {
    const { HOP_LENGTH } = MEL_CONSTANTS;
    // One sample short of a hop is still frame 0; one sample past it is frame 1.
    const justBeforeHop = sampleToFrame(HOP_LENGTH - 1);
    expect(justBeforeHop).toBe(0);
    const justAfterHop = sampleToFrame(HOP_LENGTH + 1);
    expect(justAfterHop).toBe(1);
  });
});
// ─── End-to-End Mel Pipeline ──────────────────────────────────────────────
describe('End-to-End Mel Pipeline', () => {
  it('should produce deterministic results for the same input', () => {
    const nMels = 128;
    const hannWindow = createPaddedHannWindow();
    const twiddles = precomputeTwiddles(MEL_CONSTANTS.N_FFT);
    const filterbank = createMelFilterbank(nMels);

    // Repeatable input: 4800 samples (300 ms at 16 kHz) of a half-amplitude 440 Hz sine.
    const tone = new Float32Array(4800);
    for (let s = 0; s < tone.length; s++) {
      tone[s] = Math.sin(2 * Math.PI * 440 * s / 16000) * 0.5;
    }
    const emphasized = preemphasize(tone);

    // Compute the same frame twice from the same inputs.
    const firstPass = computeMelFrame(emphasized, 5, hannWindow, twiddles, filterbank, nMels);
    const secondPass = computeMelFrame(emphasized, 5, hannWindow, twiddles, filterbank, nMels);

    // Every bin must match exactly, not just approximately.
    for (let bin = 0; bin < nMels; bin++) {
      expect(firstPass[bin]).toBe(secondPass[bin]);
    }
  });

  it('should produce correct number of frames for given audio length', () => {
    // 1 second = 16000 samples → 100 frames
    const framesFor1s = sampleToFrame(16000);
    expect(framesFor1s).toBe(100);
    // 5 seconds = 80000 samples → 500 frames
    const framesFor5s = sampleToFrame(80000);
    expect(framesFor5s).toBe(500);
    // 7 seconds = 112000 samples → 700 frames
    const framesFor7s = sampleToFrame(112000);
    expect(framesFor7s).toBe(700);
  });
});