Spaces:

ysdede
/

keet-streaming

Running

App Files Files Community

keet-streaming / src /lib /audio /mel-math.ts

ysdede

feat(space): migrate Hugging Face Space to keet SolidJS app

b8cc2bf 28 days ago

raw

history blame contribute delete

9.56 kB

	/**
	* Keet - Mel Spectrogram Math
	*
	* Pure computation functions for mel spectrogram feature extraction.
	* Matches NeMo / onnx-asr / parakeet.js mel.js exactly.
	*
	* Designed to be self-contained and reusable:
	* - No external dependencies
	* - All functions are pure (no side effects)
	* - Can be imported by workers, tests, or bundled as a standalone package
	*/

	// ═══════════════════════════════════════════════════════════════════════════
	// Constants
	// ═══════════════════════════════════════════════════════════════════════════

	/**
	 * Fixed front-end parameters shared by every function in this module.
	 *
	 * - SAMPLE_RATE:    input sample rate in Hz
	 * - N_FFT:          FFT size (power of two)
	 * - WIN_LENGTH:     analysis window length in samples
	 * - HOP_LENGTH:     hop between successive frames in samples
	 * - PREEMPH:        default pre-emphasis coefficient
	 * - LOG_ZERO_GUARD: small positive value added before taking the log
	 * - N_FREQ_BINS:    number of one-sided FFT bins, N_FFT / 2 + 1
	 * - DEFAULT_N_MELS: default number of mel bands
	 */
	export const MEL_CONSTANTS = {
		SAMPLE_RATE: 16000,
		N_FFT: 512,
		WIN_LENGTH: 400,
		HOP_LENGTH: 160,
		PREEMPH: 0.97,
		// Must be an exponentiation, not a subtraction: `2 - 24` is -22, which
		// turns log(mel + guard) into NaN for every low-energy mel bin.
		LOG_ZERO_GUARD: 2 ** -24, // 2^-24 ≈ 5.96e-8
		N_FREQ_BINS: (512 >> 1) + 1, // 257
		DEFAULT_N_MELS: 128,
	} as const;

	// Slaney mel scale parameters: a linear region followed by a logarithmic one.
	const F_SP = 200 / 3; // Hz per mel in the linear region (~66.667)
	const MIN_LOG_HZ = 1000; // frequency at which the logarithmic region begins
	const MIN_LOG_MEL = MIN_LOG_HZ / F_SP; // mel value at MIN_LOG_HZ (= 15.0)
	const LOG_STEP = Math.log(6.4) / 27; // natural-log step per mel in the log region

	// ═══════════════════════════════════════════════════════════════════════════
	// Mel Scale Helpers
	// ═══════════════════════════════════════════════════════════════════════════

	/**
	 * Map a frequency in Hz onto the Slaney mel scale.
	 *
	 * Frequencies at or above MIN_LOG_HZ use the logarithmic branch; everything
	 * else uses the linear branch (freq / F_SP).
	 *
	 * @param freq Frequency in Hz
	 * @returns Corresponding mel value
	 */
	export function hzToMel(freq: number): number {
		if (freq >= MIN_LOG_HZ) {
			const octaves = Math.log(freq / MIN_LOG_HZ);
			return MIN_LOG_MEL + octaves / LOG_STEP;
		}
		return freq / F_SP;
	}

	/**
	 * Map a Slaney mel value back to a frequency in Hz (inverse of hzToMel).
	 *
	 * Mel values at or above MIN_LOG_MEL use the exponential branch; everything
	 * else uses the linear branch (mel * F_SP).
	 *
	 * @param mel Mel value
	 * @returns Corresponding frequency in Hz
	 */
	export function melToHz(mel: number): number {
		if (mel >= MIN_LOG_MEL) {
			const exponent = LOG_STEP * (mel - MIN_LOG_MEL);
			return MIN_LOG_HZ * Math.exp(exponent);
		}
		return mel * F_SP;
	}

	/**
	 * Build a triangular mel filterbank with Slaney (area) normalization.
	 *
	 * @param nMels Number of mel bands (rows)
	 * @returns Flat row-major Float32Array of shape [nMels × N_FREQ_BINS];
	 *          the weight of band m at FFT bin k is at index m * N_FREQ_BINS + k
	 */
	export function createMelFilterbank(nMels: number): Float32Array {
		const binCount = MEL_CONSTANTS.N_FREQ_BINS;
		const nyquist = MEL_CONSTANTS.SAMPLE_RATE / 2; // 8000

		// Frequency of every FFT bin, evenly spaced over [0, nyquist].
		const binHz = new Float64Array(binCount);
		for (let bin = 0; bin < binCount; bin++) {
			binHz[bin] = (nyquist * bin) / (binCount - 1);
		}

		// nMels + 2 band edges, evenly spaced on the mel axis, mapped back to Hz.
		const melLo = hzToMel(0);
		const melHi = hzToMel(nyquist);
		const edgeCount = nMels + 2;
		const edgeHz = new Float64Array(edgeCount);
		for (let e = 0; e < edgeCount; e++) {
			edgeHz[e] = melToHz(melLo + ((melHi - melLo) * e) / (edgeCount - 1));
		}

		// Width in Hz between each pair of neighbouring edges.
		const edgeGap = new Float64Array(edgeCount - 1);
		for (let e = 0; e < edgeCount - 1; e++) {
			edgeGap[e] = edgeHz[e + 1] - edgeHz[e];
		}

		const weights = new Float32Array(nMels * binCount);
		for (let band = 0; band < nMels; band++) {
			const leftEdge = edgeHz[band];
			const rightEdge = edgeHz[band + 2];
			const areaNorm = 2.0 / (rightEdge - leftEdge); // slaney normalization
			const rowStart = band * binCount;
			for (let bin = 0; bin < binCount; bin++) {
				const hz = binHz[bin];
				// Rising edge from the left, falling edge towards the right;
				// the triangle is the lower of the two, clipped at zero.
				const rising = (hz - leftEdge) / edgeGap[band];
				const falling = (rightEdge - hz) / edgeGap[band + 1];
				weights[rowStart + bin] = Math.max(0, Math.min(rising, falling)) * areaNorm;
			}
		}
		return weights;
	}

	/**
	 * Build a symmetric Hann window of WIN_LENGTH samples, centred inside a
	 * zero-filled buffer of N_FFT samples.
	 *
	 * @returns Float64Array of length N_FFT
	 */
	export function createPaddedHannWindow(): Float64Array {
		const fftSize = MEL_CONSTANTS.N_FFT;
		const winLen = MEL_CONSTANTS.WIN_LENGTH;
		const offset = (fftSize - winLen) >> 1; // 56
		const padded = new Float64Array(fftSize);
		for (let pos = offset; pos < offset + winLen; pos++) {
			const n = pos - offset; // index within the un-padded window
			padded[pos] = 0.5 * (1 - Math.cos((2 * Math.PI * n) / (winLen - 1)));
		}
		return padded;
	}

	/**
	 * Tabulate the first N/2 forward-FFT twiddle factors e^(-2πi·k/N).
	 *
	 * @param N FFT size
	 * @returns `cos[k]` and `sin[k]` of the angle -2πk/N for k in [0, N >> 1)
	 */
	export function precomputeTwiddles(N: number): { cos: Float64Array; sin: Float64Array } {
		const count = N >> 1;
		const angleAt = (k: number): number => (-2 * Math.PI * k) / N;
		const cos = new Float64Array(count).map((_, k) => Math.cos(angleAt(k)));
		const sin = new Float64Array(count).map((_, k) => Math.sin(angleAt(k)));
		return { cos, sin };
	}

	/**
	 * Iterative in-place radix-2 decimation-in-time FFT.
	 *
	 * @param re Real parts; overwritten with the transform's real parts
	 * @param im Imaginary parts; overwritten with the transform's imaginary parts
	 * @param n Transform size (must be a power of 2)
	 * @param tw Twiddle tables where cos[k] / sin[k] hold the cosine / sine of
	 *           -2πk/n for k in [0, n/2)
	 */
	export function fft(re: Float64Array, im: Float64Array, n: number, tw: { cos: Float64Array; sin: Float64Array }): void {
		// Stage 1: reorder the input into bit-reversed index order. `rev` is
		// maintained incrementally as the bit-reversal of `pos`.
		let rev = 0;
		for (let pos = 1; pos < n; pos++) {
			let mask = n >> 1;
			for (; rev & mask; mask >>= 1) {
				rev ^= mask;
			}
			rev ^= mask;
			if (pos < rev) {
				// Swap each pair only once.
				[re[pos], re[rev]] = [re[rev], re[pos]];
				[im[pos], im[rev]] = [im[rev], im[pos]];
			}
		}

		// Stage 2: butterflies, doubling the sub-transform span on every pass.
		for (let span = 2; span <= n; span <<= 1) {
			const halfSpan = span >> 1;
			const stride = n / span; // spacing of this pass's twiddles in the table
			for (let start = 0; start < n; start += span) {
				for (let k = 0; k < halfSpan; k++) {
					const lo = start + k;
					const hi = lo + halfSpan;
					const c = tw.cos[k * stride];
					const s = tw.sin[k * stride];
					const xr = re[hi];
					const xi = im[hi];
					// Complex product (xr + i·xi) · (c + i·s).
					const tRe = xr * c - xi * s;
					const tIm = xr * s + xi * c;
					re[hi] = re[lo] - tRe;
					im[hi] = im[lo] - tIm;
					re[lo] += tRe;
					im[lo] += tIm;
				}
			}
		}
	}

	/**
	 * First-order pre-emphasis filter: y[i] = x[i] - coeff * x[i - 1].
	 *
	 * The input chunk is not modified; a new array is returned.
	 *
	 * @param chunk Raw audio chunk
	 * @param lastSample Final sample of the previous chunk, used as x[-1] so the
	 *                   filter is continuous across chunk boundaries (default 0)
	 * @param coeff Pre-emphasis coefficient (default MEL_CONSTANTS.PREEMPH)
	 * @returns Pre-emphasized samples, same length as `chunk`
	 */
	export function preemphasize(chunk: Float32Array, lastSample: number = 0, coeff: number = MEL_CONSTANTS.PREEMPH): Float32Array {
		const filtered = new Float32Array(chunk.length);
		let previous = lastSample;
		for (let i = 0; i < chunk.length; i++) {
			const current = chunk[i];
			filtered[i] = current - coeff * previous;
			previous = current;
		}
		return filtered;
	}

	/**
	 * Compute the log-mel values of one frame of pre-emphasized audio.
	 *
	 * The frame covers N_FFT samples starting N_FFT / 2 samples before
	 * frameIdx * HOP_LENGTH; positions outside the buffer are read as zero.
	 * Steps: window → FFT → power spectrum → mel filterbank → log.
	 *
	 * @param preemphAudio Full pre-emphasized audio buffer
	 * @param frameIdx Frame index
	 * @param hannWindow Pre-computed window of length N_FFT
	 * @param twiddles Pre-computed FFT twiddle factors for size N_FFT
	 * @param melFilterbank Flat row-major filterbank [nMels × N_FREQ_BINS]
	 * @param nMels Number of mel bins
	 * @returns Raw (un-normalized) log-mel values for this frame, length nMels
	 */
	export function computeMelFrame(
		preemphAudio: Float32Array,
		frameIdx: number,
		hannWindow: Float64Array,
		twiddles: { cos: Float64Array; sin: Float64Array },
		melFilterbank: Float32Array,
		nMels: number,
	): Float32Array {
		const fftSize = MEL_CONSTANTS.N_FFT;
		const binCount = MEL_CONSTANTS.N_FREQ_BINS;
		const halfFft = fftSize >> 1; // 256
		const firstSample = frameIdx * MEL_CONSTANTS.HOP_LENGTH - halfFft;
		const available = preemphAudio.length;

		// Windowed frame in the real buffer; the imaginary buffer stays at the
		// zeros it was allocated with.
		const real = new Float64Array(fftSize);
		const imag = new Float64Array(fftSize);
		for (let k = 0; k < fftSize; k++) {
			const src = firstSample + k;
			const inside = src >= 0 && src < available;
			real[k] = (inside ? preemphAudio[src] : 0) * hannWindow[k];
		}

		fft(real, imag, fftSize, twiddles);

		// One-sided power spectrum |X[k]|², stored in single precision.
		const spectrum = new Float32Array(binCount);
		for (let k = 0; k < binCount; k++) {
			spectrum[k] = real[k] * real[k] + imag[k] * imag[k];
		}

		// Project onto each mel band, then take the guarded natural log.
		const logMel = new Float32Array(nMels);
		for (let band = 0; band < nMels; band++) {
			const rowStart = band * binCount;
			let energy = 0;
			for (let k = 0; k < binCount; k++) {
				energy += spectrum[k] * melFilterbank[rowStart + k];
			}
			logMel[band] = Math.log(energy + MEL_CONSTANTS.LOG_ZERO_GUARD);
		}
		return logMel;
	}

	/**
	 * Standardize every mel band over time: subtract the band's mean and divide
	 * by its sample standard deviation (N - 1 denominator) plus 1e-5.
	 *
	 * When T is 1 or less the scale factor is 0.
	 *
	 * @param features Flat array [nMels × T], mel-major layout
	 * @param nMels Number of mel bins
	 * @param T Number of time frames
	 * @returns Normalized features (new array; the input is left untouched)
	 */
	export function normalizeMelFeatures(features: Float32Array, nMels: number, T: number): Float32Array {
		const normalized = new Float32Array(features.length);

		for (let band = 0; band < nMels; band++) {
			const rowStart = band * T;

			// Mean of this band across all frames.
			let total = 0;
			for (let t = 0; t < T; t++) {
				total += features[rowStart + t];
			}
			const mean = total / T;

			// Sum of squared deviations from the mean.
			let squaredDev = 0;
			for (let t = 0; t < T; t++) {
				const delta = features[rowStart + t] - mean;
				squaredDev += delta * delta;
			}

			// Bessel-corrected standard deviation; undefined for a single frame,
			// in which case the scale is zero.
			let scale = 0;
			if (T > 1) {
				scale = 1.0 / (Math.sqrt(squaredDev / (T - 1)) + 1e-5);
			}

			for (let t = 0; t < T; t++) {
				normalized[rowStart + t] = (features[rowStart + t] - mean) * scale;
			}
		}

		return normalized;
	}

	/**
	 * Map a sample offset to the index of the frame hop that contains it.
	 *
	 * @param sampleOffset Offset in samples
	 * @returns floor(sampleOffset / HOP_LENGTH)
	 */
	export function sampleToFrame(sampleOffset: number): number {
		const { HOP_LENGTH } = MEL_CONSTANTS;
		const hops = sampleOffset / HOP_LENGTH;
		return Math.floor(hops);
	}