ysdede's picture
feat(space): migrate Hugging Face Space to keet SolidJS app
b8cc2bf
/**
* Keet - Mel Spectrogram Math
*
* Pure computation functions for mel spectrogram feature extraction.
* Matches NeMo / onnx-asr / parakeet.js mel.js exactly.
*
* Designed to be self-contained and reusable:
* - No external dependencies
* - All functions are pure (no side effects)
* - Can be imported by workers, tests, or bundled as a standalone package
*/
// ═══════════════════════════════════════════════════════════════════════════
// Constants
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Fixed front-end parameters shared by the functions in this module.
 * Declared `as const` so the literal fields keep their literal types.
 */
export const MEL_CONSTANTS = {
  /** Audio sample rate in Hz. */
  SAMPLE_RATE: 16000,
  /** FFT size in samples. */
  N_FFT: 512,
  /** Analysis window length in samples (zero-padded up to N_FFT). */
  WIN_LENGTH: 400,
  /** Distance between the starts of consecutive frames, in samples. */
  HOP_LENGTH: 160,
  /** Default pre-emphasis coefficient. */
  PREEMPH: 0.97,
  /** Added to each mel energy before taking the log; 2^-24, about 5.96e-8. */
  LOG_ZERO_GUARD: 1 / (1 << 24),
  /** Number of one-sided spectrum bins: N_FFT / 2 + 1 = 257. */
  N_FREQ_BINS: 512 / 2 + 1,
  /** Default number of mel bands. */
  DEFAULT_N_MELS: 128,
} as const;
// Slaney-variant mel scale parameters: the scale is linear below MIN_LOG_HZ
// and logarithmic at or above it.
const F_SP = 200 / 3; // Hz per mel in the linear region (~66.667)
const MIN_LOG_HZ = 1000; // frequency (Hz) where the logarithmic region starts
const MIN_LOG_MEL = MIN_LOG_HZ / F_SP; // mel value at MIN_LOG_HZ (nominally 15)
const LOG_STEP = Math.log(6.4) / 27; // natural-log step per mel in the log region
// ═══════════════════════════════════════════════════════════════════════════
// Mel Scale Helpers
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Map a frequency in Hz onto the mel scale (Slaney variant).
 *
 * Frequencies below MIN_LOG_HZ are divided by F_SP (linear region);
 * frequencies at or above it follow a logarithmic curve that starts at
 * MIN_LOG_MEL.
 *
 * @param freq Frequency in Hz
 * @returns Corresponding mel value
 */
export function hzToMel(freq: number): number {
  if (freq >= MIN_LOG_HZ) {
    // Logarithmic region.
    return MIN_LOG_MEL + Math.log(freq / MIN_LOG_HZ) / LOG_STEP;
  }
  // Linear region.
  return freq / F_SP;
}
/**
 * Map a mel value back to a frequency in Hz (Slaney variant).
 *
 * Mel values below MIN_LOG_MEL are multiplied by F_SP (linear region);
 * values at or above it are expanded exponentially from MIN_LOG_HZ.
 *
 * @param mel Value on the mel scale
 * @returns Corresponding frequency in Hz
 */
export function melToHz(mel: number): number {
  if (mel >= MIN_LOG_MEL) {
    // Logarithmic region.
    return MIN_LOG_HZ * Math.exp(LOG_STEP * (mel - MIN_LOG_MEL));
  }
  // Linear region.
  return mel * F_SP;
}
/**
 * Build a triangular mel filterbank with Slaney (area) normalization.
 *
 * The result is a flat, row-major matrix of shape [nMels x N_FREQ_BINS]:
 * the weight of frequency bin k in mel band m is at index m * N_FREQ_BINS + k.
 *
 * @param nMels Number of mel bands
 * @returns Filter weights as a Float32Array of length nMels * N_FREQ_BINS
 */
export function createMelFilterbank(nMels: number): Float32Array {
  const nyquist = MEL_CONSTANTS.SAMPLE_RATE / 2; // 8000
  const nBins = MEL_CONSTANTS.N_FREQ_BINS;

  // Frequency (Hz) of each FFT bin, evenly spaced over [0, nyquist].
  const binHz = new Float64Array(nBins);
  for (let bin = 0; bin < nBins; bin++) {
    binHz[bin] = (nyquist * bin) / (nBins - 1);
  }

  // nMels + 2 band edges, equally spaced on the mel axis, mapped back to Hz.
  const melLo = hzToMel(0);
  const melSpan = hzToMel(nyquist) - melLo;
  const edgeCount = nMels + 2;
  const edgeHz = new Float64Array(edgeCount);
  for (let e = 0; e < edgeCount; e++) {
    edgeHz[e] = melToHz(melLo + (melSpan * e) / (edgeCount - 1));
  }

  // Width in Hz between each pair of neighbouring edges.
  const edgeGap = new Float64Array(edgeCount - 1);
  for (let e = 0; e + 1 < edgeCount; e++) {
    edgeGap[e] = edgeHz[e + 1] - edgeHz[e];
  }

  const weights = new Float32Array(nMels * nBins);
  for (let band = 0; band < nMels; band++) {
    const lowHz = edgeHz[band];
    const highHz = edgeHz[band + 2];
    const risingWidth = edgeGap[band];
    const fallingWidth = edgeGap[band + 1];
    // Slaney normalization: scale each triangle by 2 / (upper edge - lower edge).
    const areaNorm = 2.0 / (highHz - lowHz);
    const row = band * nBins;
    for (let bin = 0; bin < nBins; bin++) {
      const hz = binHz[bin];
      const rising = (hz - lowHz) / risingWidth;
      const falling = (highHz - hz) / fallingWidth;
      // The triangle is the lower of the two ramps, clipped at zero.
      weights[row + bin] = Math.max(0, Math.min(rising, falling)) * areaNorm;
    }
  }
  return weights;
}
/**
 * Build a symmetric Hann window of WIN_LENGTH samples, centred inside a
 * zero-filled buffer of N_FFT samples.
 *
 * @returns Window coefficients as a Float64Array of length N_FFT
 */
export function createPaddedHannWindow(): Float64Array {
  const fftSize = MEL_CONSTANTS.N_FFT;
  const winLen = MEL_CONSTANTS.WIN_LENGTH;
  const padded = new Float64Array(fftSize);
  const offset = (fftSize - winLen) >> 1; // 56 zeros before the window
  const lastIdx = winLen - 1;
  for (let i = 0; i < winLen; i++) {
    padded[offset + i] = 0.5 * (1 - Math.cos((2 * Math.PI * i) / lastIdx));
  }
  return padded;
}
/**
 * Tabulate the FFT twiddle factors for a transform of size N.
 *
 * Entry k holds the cosine and sine of the angle -2 * PI * k / N,
 * for k = 0 .. (N >> 1) - 1.
 *
 * @param N FFT size
 * @returns Cosine and sine tables, each of length N >> 1
 */
export function precomputeTwiddles(N: number): { cos: Float64Array; sin: Float64Array } {
  const count = N >> 1;
  const table = { cos: new Float64Array(count), sin: new Float64Array(count) };
  for (let k = 0; k < count; k++) {
    const theta = (-2 * Math.PI * k) / N;
    table.cos[k] = Math.cos(theta);
    table.sin[k] = Math.sin(theta);
  }
  return table;
}
/**
 * Radix-2 Cooley-Tukey FFT, computed in place.
 *
 * The inputs are first reordered by bit-reversed index, then combined with
 * butterflies of doubling span (2, 4, ..., n). Stage `span` reads twiddle
 * entries at multiples of n / span, so the tables need n / 2 entries.
 *
 * @param re Real part (overwritten with the result)
 * @param im Imaginary part (overwritten with the result)
 * @param n FFT size (must be a power of 2)
 * @param tw Precomputed twiddle tables for size n (as produced by precomputeTwiddles(n))
 */
export function fft(re: Float64Array, im: Float64Array, n: number, tw: { cos: Float64Array; sin: Float64Array }): void {
  // Stage 1: bit-reversal permutation. `rev` tracks the bit-reversed value of
  // `pos` and is advanced like a counter that carries from the top bit down.
  let rev = 0;
  for (let pos = 1; pos < n; pos++) {
    let mask = n >> 1;
    for (; rev & mask; mask >>= 1) {
      rev ^= mask;
    }
    rev ^= mask;
    if (pos < rev) {
      // Swap each pair only once.
      const swapRe = re[pos];
      re[pos] = re[rev];
      re[rev] = swapRe;
      const swapIm = im[pos];
      im[pos] = im[rev];
      im[rev] = swapIm;
    }
  }

  // Stage 2: butterflies.
  const cosTab = tw.cos;
  const sinTab = tw.sin;
  for (let span = 2; span <= n; span <<= 1) {
    const halfSpan = span >> 1;
    const stride = n / span;
    for (let blockStart = 0; blockStart < n; blockStart += span) {
      for (let k = 0; k < halfSpan; k++) {
        const wRe = cosTab[k * stride];
        const wIm = sinTab[k * stride];
        const lo = blockStart + k;
        const hi = lo + halfSpan;
        const hiRe = re[hi];
        const hiIm = im[hi];
        // Complex product of the upper element with the twiddle factor.
        const prodRe = hiRe * wRe - hiIm * wIm;
        const prodIm = hiRe * wIm + hiIm * wRe;
        re[hi] = re[lo] - prodRe;
        im[hi] = im[lo] - prodIm;
        re[lo] += prodRe;
        im[lo] += prodIm;
      }
    }
  }
}
/**
 * Apply a first-order pre-emphasis filter: out[i] = chunk[i] - coeff * chunk[i - 1].
 *
 * For the first sample, `lastSample` stands in for chunk[-1], so consecutive
 * chunks can be filtered without a discontinuity at the boundary.
 *
 * @param chunk Raw audio chunk (not modified)
 * @param lastSample Last sample of the previous chunk (default 0)
 * @param coeff Pre-emphasis coefficient (default MEL_CONSTANTS.PREEMPH, 0.97)
 * @returns New array of pre-emphasized samples, same length as `chunk`
 */
export function preemphasize(chunk: Float32Array, lastSample: number = 0, coeff: number = MEL_CONSTANTS.PREEMPH): Float32Array {
  const filtered = new Float32Array(chunk.length);
  let previous = lastSample;
  for (let i = 0; i < chunk.length; i++) {
    const current = chunk[i];
    filtered[i] = current - coeff * previous;
    previous = current;
  }
  return filtered;
}
/**
 * Compute the log-mel values of one frame of pre-emphasized audio.
 *
 * Frame `frameIdx` starts at sample frameIdx * HOP_LENGTH - N_FFT / 2 and
 * spans N_FFT samples; positions outside the buffer are read as zero. The
 * frame is windowed, transformed with `fft`, reduced to a power spectrum over
 * the first N_FREQ_BINS bins, projected through the mel filterbank, and
 * passed through a natural log after adding LOG_ZERO_GUARD.
 *
 * @param preemphAudio Full pre-emphasized audio buffer
 * @param frameIdx Frame index
 * @param hannWindow Pre-computed window, indexed 0 .. N_FFT - 1
 * @param twiddles Pre-computed FFT twiddle factors for size N_FFT
 * @param melFilterbank Flat row-major filterbank [nMels x N_FREQ_BINS]
 * @param nMels Number of mel bins
 * @returns Raw (un-normalized) log-mel values for this frame, length nMels
 */
export function computeMelFrame(
  preemphAudio: Float32Array,
  frameIdx: number,
  hannWindow: Float64Array,
  twiddles: { cos: Float64Array; sin: Float64Array },
  melFilterbank: Float32Array,
  nMels: number,
): Float32Array {
  const fftSize = MEL_CONSTANTS.N_FFT;
  const nBins = MEL_CONSTANTS.N_FREQ_BINS;
  const logGuard = MEL_CONSTANTS.LOG_ZERO_GUARD;

  // The frame begins half an FFT length (256 samples) before frameIdx * HOP_LENGTH.
  const firstSample = frameIdx * MEL_CONSTANTS.HOP_LENGTH - (fftSize >> 1);
  const audioLen = preemphAudio.length;

  // Windowed frame; the imaginary buffer stays at its zero-initialized state.
  const real = new Float64Array(fftSize);
  const imag = new Float64Array(fftSize);
  for (let n = 0; n < fftSize; n++) {
    const src = firstSample + n;
    const inRange = src >= 0 && src < audioLen;
    real[n] = (inRange ? preemphAudio[src] : 0) * hannWindow[n];
  }

  fft(real, imag, fftSize, twiddles);

  // Squared magnitude of the first nBins bins.
  const power = new Float32Array(nBins);
  for (let b = 0; b < nBins; b++) {
    const r = real[b];
    const q = imag[b];
    power[b] = r * r + q * q;
  }

  // Weighted sum per mel band, then guarded log.
  const logMel = new Float32Array(nMels);
  for (let band = 0, row = 0; band < nMels; band++, row += nBins) {
    let energy = 0;
    for (let b = 0; b < nBins; b++) {
      energy += power[b] * melFilterbank[row + b];
    }
    logMel[band] = Math.log(energy + logGuard);
  }
  return logMel;
}
/**
 * Standardize each mel bin over time: subtract the bin's mean and divide by
 * its standard deviation plus 1e-5.
 *
 * The standard deviation uses Bessel's correction (divides by T - 1). When
 * T is 1 the scale factor is 0, so every output value is 0.
 *
 * @param features Flat array [nMels x T], mel-major layout (bin m occupies m * T .. m * T + T - 1)
 * @param nMels Number of mel bins
 * @param T Number of time frames
 * @returns Normalized features (new array; `features` is not modified)
 */
export function normalizeMelFeatures(features: Float32Array, nMels: number, T: number): Float32Array {
  const STD_EPS = 1e-5;
  const normalized = new Float32Array(features.length);

  for (let bin = 0; bin < nMels; bin++) {
    const rowStart = bin * T;
    const rowEnd = rowStart + T;

    // Mean over time for this bin.
    let total = 0;
    for (let i = rowStart; i < rowEnd; i++) {
      total += features[i];
    }
    const mean = total / T;

    // Sum of squared deviations from the mean.
    let sqDev = 0;
    for (let i = rowStart; i < rowEnd; i++) {
      const dev = features[i] - mean;
      sqDev += dev * dev;
    }

    // Reciprocal of (sample std + epsilon); 0 when the variance is undefined.
    let scale = 0;
    if (T > 1) {
      scale = 1.0 / (Math.sqrt(sqDev / (T - 1)) + STD_EPS);
    }

    for (let i = rowStart; i < rowEnd; i++) {
      normalized[i] = (features[i] - mean) * scale;
    }
  }
  return normalized;
}
/**
 * Convert a sample offset to the index of the frame it falls in, i.e. the
 * offset divided by HOP_LENGTH, rounded down.
 *
 * @param sampleOffset Offset in samples
 * @returns Frame index
 */
export function sampleToFrame(sampleOffset: number): number {
  const hop = MEL_CONSTANTS.HOP_LENGTH;
  return Math.floor(sampleOffset / hop);
}