keet-streaming / src /lib /audio /mel.worker.ts
ysdede's picture
feat(space): migrate Hugging Face Space to keet SolidJS app
b8cc2bf
/**
* Keet - Continuous Mel Producer Worker
*
* Runs in a separate Web Worker thread, continuously computing raw log-mel
* spectrogram frames as audio arrives. When the inference pipeline needs
* features for a time window, it requests normalized features from this worker.
*
* Architecture:
* AudioEngine → pushAudio(chunk) → mel.worker computes raw mel frames incrementally
* Inference trigger → getFeatures(startFrame, endFrame) → mel.worker normalizes & returns
*
* This decouples mel computation from the inference thread entirely.
* Features are always ready when the encoder needs them → zero wait for preprocessing.
*
* Imports from mel-math.ts (local module, no external deps).
*/
import {
MEL_CONSTANTS,
createMelFilterbank,
createPaddedHannWindow,
precomputeTwiddles,
fft,
normalizeMelFeatures,
sampleToFrame,
} from './mel-math';
// DSP constants shared with the rest of the pipeline, taken from MEL_CONSTANTS in ./mel-math.
const { N_FFT, HOP_LENGTH, N_FREQ_BINS, PREEMPH, LOG_ZERO_GUARD } = MEL_CONSTANTS;
// ═══════════════════════════════════════════════════════════════════════════
// Worker State
// ═══════════════════════════════════════════════════════════════════════════
// Number of mel bins per frame. Overwritten by init() from its config (falls back to 128).
let nMels = 128;
// Pre-emphasized audio buffer (compacted after each pushAudio to stay bounded).
// Only retains samples needed for the next mel frame's FFT window.
let preemphBuffer = new Float32Array(0);
let preemphBaseIdx = 0; // Global sample index corresponding to preemphBuffer[0]
let preemphLen = 0; // Number of valid samples currently in preemphBuffer
// Last raw (not pre-emphasized) sample of the previous chunk. pushAudio uses it as
// the predecessor when pre-emphasizing the first sample of the next chunk.
let lastRawSample = 0;
// Total number of samples pushed since the last init()/reset().
let totalSamples = 0;
// Raw mel frame buffer: fixed-size circular, mel-major layout [nMels * maxFrames].
// For mel bin m at frame t: rawMelBuffer[m * maxFrames + (t % maxFrames)].
// Stays null until init() allocates it; pushAudio/getFeatures/getLastMelFrame bail out while it is null.
let rawMelBuffer: Float32Array | null = null;
// Capacity of the circular buffer in frames (set by init()).
let maxFrames = 0;
let computedFrames = 0; // Monotonic: total frames computed (next frame index)
let baseFrame = 0; // Oldest frame still available in the circular buffer
// Pre-allocated FFT buffers (reused per frame)
// fftRe/fftIm hold the windowed frame on input and the spectrum after fft();
// powerBuf holds the first N_FREQ_BINS squared magnitudes. All three are allocated in init().
let fftRe: Float64Array;
let fftIm: Float64Array;
let powerBuf: Float32Array;
// Pre-computed constants
// melFilterbank is read as melFilterbank[m * N_FREQ_BINS + k] (mel bin m, frequency bin k);
// hannWindow is indexed per FFT sample; twiddles are passed straight to fft(). Built in init().
let melFilterbank: Float32Array;
let hannWindow: Float64Array;
let twiddles: { cos: Float64Array; sin: Float64Array };
// Logging throttle for getFeatures (avoid console spam)
// performance.now() timestamp of the most recent getFeatures log line.
let lastGetFeaturesLogTime = 0;
const GET_FEATURES_LOG_INTERVAL = 5000; // Log every 5 seconds max
// ═══════════════════════════════════════════════════════════════════════════
// Initialization
// ═══════════════════════════════════════════════════════════════════════════
/**
 * (Re)initialize the worker.
 *
 * Builds the mel filterbank, window and FFT twiddle tables, allocates the
 * scratch FFT buffers, the circular raw-mel buffer and the pre-emphasis
 * buffer, and zeroes all streaming counters.
 *
 * @param config - Optional settings; `nMels` selects the number of mel bins
 *   and falls back to 128 when missing or falsy.
 */
function init(config: { nMels?: number }) {
  const startedAt = performance.now();

  // Sizing choices, named so the allocations below read clearly.
  // ~120 seconds of frames at 100 fps; older frames are overwritten in place.
  const circularCapacityFrames = 12000;
  // Room for an incoming chunk of up to 1 s on top of the N_FFT overlap.
  const chunkHeadroomSamples = 16000;

  nMels = config.nMels || 128;

  // Lookup tables for the mel computation.
  melFilterbank = createMelFilterbank(nMels);
  hannWindow = createPaddedHannWindow();
  twiddles = precomputeTwiddles(N_FFT);

  // Scratch space reused for every frame.
  fftRe = new Float64Array(N_FFT);
  fftIm = new Float64Array(N_FFT);
  powerBuf = new Float32Array(N_FREQ_BINS);

  // Circular mel storage: allocated once, never resized.
  maxFrames = circularCapacityFrames;
  rawMelBuffer = new Float32Array(nMels * maxFrames);
  computedFrames = 0;
  baseFrame = 0;

  // Pre-emphasized audio: pushAudio compacts it after every call, so it only
  // has to hold the FFT overlap plus one incoming chunk.
  preemphBuffer = new Float32Array(N_FFT + chunkHeadroomSamples);
  preemphBaseIdx = 0;
  preemphLen = 0;
  lastRawSample = 0;
  totalSamples = 0;

  const melBufBytes = nMels * maxFrames * 4;
  const melBufMB = (melBufBytes / 1024 / 1024).toFixed(1);
  console.log(`[MelWorker] Initialized: nMels=${nMels}, maxFrames=${maxFrames} (circular), ${melBufMB}MB mel buffer, preemph=${preemphBuffer.length} samples, init ${(performance.now() - startedAt).toFixed(1)} ms`);
}
// ═══════════════════════════════════════════════════════════════════════════
// Incremental Mel Computation
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Append one chunk of raw audio and compute every mel frame it makes available.
 *
 * Steps:
 *  1. Pre-emphasize the chunk (y[i] = x[i] - PREEMPH * x[i-1]) and append it to
 *     preemphBuffer. The predecessor of the chunk's first sample is the last
 *     raw sample of the previous chunk (0 right after init/reset).
 *  2. Compute frames [computedFrames, floor(totalSamples / HOP_LENGTH)).
 *  3. Write log-mel values into the circular rawMelBuffer and advance baseFrame
 *     once more than maxFrames frames have been produced.
 *  4. Compact preemphBuffer so it keeps only what the next frame needs.
 *
 * Does nothing when init() has not allocated rawMelBuffer yet or when the
 * chunk is empty.
 *
 * NOTE(review): a frame's window spans N_FFT samples starting at
 * t * HOP_LENGTH - N_FFT/2, but frame t is emitted as soon as
 * (t + 1) * HOP_LENGTH samples have arrived. If HOP_LENGTH < N_FFT/2 the tail
 * of the window for the newest frame(s) has not arrived yet and is read as 0;
 * the frame is not recomputed later. Confirm this is the intended streaming
 * behaviour.
 *
 * @param chunk - Raw audio samples to append. The log line converts sample
 *   counts to seconds by dividing by 16000.
 */
function pushAudio(chunk: Float32Array) {
if (!rawMelBuffer) return;
const chunkLen = chunk.length;
if (chunkLen === 0) return;
const t0 = performance.now();
// 1. Pre-emphasize the new chunk incrementally.
// Grow preemph buffer if needed (safety net; compaction below keeps it small).
if (preemphLen + chunkLen > preemphBuffer.length) {
// At least double, and always large enough for the retained samples plus this chunk.
const newSize = Math.max(preemphBuffer.length * 2, preemphLen + chunkLen);
const newBuf = new Float32Array(newSize);
newBuf.set(preemphBuffer.subarray(0, preemphLen));
preemphBuffer = newBuf;
}
// Pre-emphasize
// First sample uses the tail of the previous chunk so the filter is continuous across chunks.
preemphBuffer[preemphLen] = chunk[0] - PREEMPH * lastRawSample;
for (let i = 1; i < chunkLen; i++) {
preemphBuffer[preemphLen + i] = chunk[i] - PREEMPH * chunk[i - 1];
}
preemphLen += chunkLen;
lastRawSample = chunk[chunkLen - 1];
totalSamples += chunkLen;
// 2. Compute new valid frames
const newTotalFrames = Math.floor(totalSamples / HOP_LENGTH);
if (newTotalFrames <= computedFrames) {
// Not enough new samples for another frame yet; still trim the buffer.
compactPreemphBuffer();
return;
}
// 3. Compute each new frame, writing into the circular mel buffer.
// No reallocation: old frames are silently overwritten via modulo.
const pad = N_FFT >> 1; // 256
for (let t = computedFrames; t < newTotalFrames; t++) {
// Global sample index of the first sample in this frame's window (negative for the earliest frames).
const frameStart = t * HOP_LENGTH - pad;
const circularT = t % maxFrames;
// a) Window the frame (using local index into compacted preemph buffer)
for (let k = 0; k < N_FFT; k++) {
const globalIdx = frameStart + k;
const localIdx = globalIdx - preemphBaseIdx;
// Samples outside the buffered range (before the stream start, already discarded, or not yet received) read as 0.
const sample = (localIdx >= 0 && localIdx < preemphLen) ? preemphBuffer[localIdx] : 0;
fftRe[k] = sample * hannWindow[k];
fftIm[k] = 0;
}
// b) 512-point FFT
fft(fftRe, fftIm, N_FFT, twiddles);
// c) Power spectrum
for (let k = 0; k < N_FREQ_BINS; k++) {
powerBuf[k] = fftRe[k] * fftRe[k] + fftIm[k] * fftIm[k];
}
// d) Mel filterbank multiply + log (circular write)
for (let m = 0; m < nMels; m++) {
let melVal = 0;
const fbOff = m * N_FREQ_BINS;
for (let k = 0; k < N_FREQ_BINS; k++) {
melVal += powerBuf[k] * melFilterbank[fbOff + k];
}
// LOG_ZERO_GUARD is added before the log so an all-zero mel energy does not yield -Infinity.
rawMelBuffer![m * maxFrames + circularT] = Math.log(melVal + LOG_ZERO_GUARD);
}
}
const prevFrames = computedFrames;
computedFrames = newTotalFrames;
// Advance baseFrame when the circular buffer has wrapped
if (computedFrames - baseFrame > maxFrames) {
baseFrame = computedFrames - maxFrames;
}
// 4. Compact preemph buffer: discard samples no longer needed
compactPreemphBuffer();
const newFramesComputed = newTotalFrames - prevFrames;
// Always true here: the early return above already handled the "no new frames" case.
if (newFramesComputed > 0) {
const elapsed = performance.now() - t0;
// Throttle: log only when this call crossed a multiple of 50 computed frames.
// NOTE(review): this comment previously read "every ~50 chunks (~4s)"; the condition
// is frame-based, not chunk-based — confirm which cadence was intended.
if (computedFrames % 50 < newFramesComputed) {
console.log(`[MelWorker] pushAudio: +${chunkLen} samples, +${newFramesComputed} frames, total ${computedFrames} frames (${(totalSamples / 16000).toFixed(1)}s), buf [${baseFrame}..${computedFrames}), preemph ${preemphLen} samples, ${elapsed.toFixed(1)} ms`);
}
}
}
/**
 * Drop pre-emphasized samples that no future mel frame will read.
 *
 * The next frame to compute (index `computedFrames`) starts its window at
 * global sample `computedFrames * HOP_LENGTH - N_FFT / 2` (clamped at 0).
 * Samples before that index are shifted out of the front of `preemphBuffer`,
 * and `preemphBaseIdx` / `preemphLen` are updated to match.
 */
function compactPreemphBuffer() {
  const halfWindow = N_FFT >> 1;
  const keepFromGlobal = Math.max(0, computedFrames * HOP_LENGTH - halfWindow);
  const staleCount = keepFromGlobal - preemphBaseIdx;

  // Every buffered sample is stale (this also covers an empty buffer).
  if (staleCount >= preemphLen) {
    preemphLen = 0;
    preemphBaseIdx = keepFromGlobal;
    return;
  }

  // Some leading samples are stale: slide the live tail to the front.
  if (staleCount > 0) {
    preemphBuffer.copyWithin(0, staleCount, preemphLen);
    preemphLen -= staleCount;
    preemphBaseIdx = keepFromGlobal;
  }
  // Otherwise nothing can be discarded yet.
}
// ═══════════════════════════════════════════════════════════════════════════
// Feature Extraction (normalize a requested range)
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Extract mel features for a frame range from the circular raw-mel buffer.
 *
 * The requested range is clamped to the frames still available,
 * [baseFrame, computedFrames). Fractional frame indices are floored. A range
 * that is empty after clamping, or whose bounds are NaN, yields null instead
 * of a malformed result.
 *
 * @param startFrame - Start frame index (inclusive)
 * @param endFrame - End frame index (exclusive)
 * @param normalize - If true (default), the extracted window is passed through
 *   normalizeMelFeatures (required for ASR). If false, the raw log-mel values
 *   are returned (for visualization with fixed dB scaling to avoid "gain
 *   hunting" during silence).
 * @returns `features` for `T` frames and `melBins` mel bins (the raw window is
 *   mel-major, [melBins, T]), or null when no data is available for the range.
 *
 * PERFORMANCE NOTE (2026-02-09): When normalize=false, the caller (e.g. visualizer)
 * still incurs the cost of extracting frames from the circular buffer. If visualization
 * performance becomes an issue, consider:
 * 1. Reducing visualizer update frequency
 * 2. Caching/reusing extracted frames between draws
 * 3. Downsampling the spectrogram (skip frames for display)
 */
function getFeatures(startFrame: number, endFrame: number, normalize: boolean = true): {
  features: Float32Array;
  T: number;
  melBins: number;
} | null {
  const t0 = performance.now();
  if (!rawMelBuffer || computedFrames === 0) {
    console.warn(`[MelWorker] getFeatures: no data (computedFrames=${computedFrames})`);
    return null;
  }
  const melBuffer = rawMelBuffer;

  // Clamp to available circular range [baseFrame, computedFrames).
  // Flooring keeps the buffer indices integral; NaN propagates and is rejected below.
  const sf = Math.max(baseFrame, Math.floor(startFrame));
  const ef = Math.min(computedFrames, Math.floor(endFrame));
  const T = ef - sf;
  // `!(T > 0)` rejects empty ranges AND NaN (which `T <= 0` would let through).
  if (!(T > 0)) {
    console.warn(`[MelWorker] getFeatures: empty range (requested ${startFrame}..${endFrame}, available ${baseFrame}..${computedFrames})`);
    return null;
  }

  // Extract the requested window from circular buffer (mel-major layout [nMels, T]).
  // T never exceeds maxFrames (sf >= baseFrame >= computedFrames - maxFrames), so the
  // window wraps around the end of a row at most once: copy it as a head segment
  // plus an optional tail segment instead of one modulo per element.
  const raw = new Float32Array(nMels * T);
  const startSlot = sf % maxFrames;
  const headLen = Math.min(T, maxFrames - startSlot);
  const tailLen = T - headLen;
  for (let m = 0; m < nMels; m++) {
    const srcRowBase = m * maxFrames;
    const dstBase = m * T;
    raw.set(melBuffer.subarray(srcRowBase + startSlot, srcRowBase + startSlot + headLen), dstBase);
    if (tailLen > 0) {
      raw.set(melBuffer.subarray(srcRowBase, srcRowBase + tailLen), dstBase + headLen);
    }
  }

  // Optionally normalize (ASR requires normalized; visualizer uses raw for fixed dB scale)
  const features = normalize ? normalizeMelFeatures(raw, nMels, T) : raw;

  // Throttled logging to avoid console spam (was causing noticeable CPU overhead)
  const now = performance.now();
  if (now - lastGetFeaturesLogTime > GET_FEATURES_LOG_INTERVAL) {
    lastGetFeaturesLogTime = now;
    const elapsed = now - t0;
    console.log(`[MelWorker] getFeatures: frames ${sf}..${ef} (${T} frames, ${(T * HOP_LENGTH / 16000).toFixed(2)}s), normalize=${normalize}, ${elapsed.toFixed(1)} ms, buf [${baseFrame}..${computedFrames})`);
  }
  return { features, T, melBins: nMels };
}
// ═══════════════════════════════════════════════════════════════════════════
// Last mel frame (for equalizer-style bar display)
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Snapshot of the most recent mel frame, for equalizer-style displays.
 *
 * Each bin is the mean of the newest raw log-mel frame and the one before it,
 * which smooths the display. With only one frame computed, that frame is
 * averaged with itself.
 *
 * @returns One raw log-mel value per mel bin, or null when the worker is not
 *   initialized or no frame has been computed yet.
 */
function getLastMelFrame(): Float32Array | null {
  const melBuf = rawMelBuffer;
  if (!melBuf || computedFrames === 0) {
    return null;
  }

  // Circular slots of the two newest frames (computedFrames >= 1 here).
  const newestSlot = (computedFrames - 1) % maxFrames;
  const previousSlot = computedFrames >= 2
    ? (computedFrames - 2) % maxFrames
    : newestSlot;

  const averaged = new Float32Array(nMels);
  // rowStart walks the mel-major rows: row m begins at m * maxFrames.
  for (let bin = 0, rowStart = 0; bin < nMels; bin++, rowStart += maxFrames) {
    averaged[bin] = 0.5 * (melBuf[rowStart + newestSlot] + melBuf[rowStart + previousSlot]);
  }
  return averaged;
}
// ═══════════════════════════════════════════════════════════════════════════
// Reset
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Return the worker to its just-initialized streaming state.
 *
 * Only the counters are cleared; the allocated buffers and precomputed tables
 * are kept. Stale mel data stays in memory but is unreachable, because
 * readers bail out while computedFrames is 0.
 */
function reset() {
  // Frame bookkeeping
  baseFrame = 0;
  computedFrames = 0;

  // Sample bookkeeping
  totalSamples = 0;
  lastRawSample = 0;
  preemphBaseIdx = 0;
  preemphLen = 0;

  console.log('[MelWorker] Reset');
}
// ═══════════════════════════════════════════════════════════════════════════
// Message Handler
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Worker message dispatcher.
 *
 * Each message is `{ type, payload, id }`. Requests that expect an answer get
 * a `<TYPE>_DONE` message carrying the same `id`; PUSH_AUDIO is fire-and-forget.
 * Any exception thrown while handling a message is reported back as
 * `{ type: 'ERROR', payload: <message string>, id }`.
 */
self.onmessage = (e: MessageEvent) => {
  // Tolerate messages posted without a data object: they fall through to the
  // "unknown type" warning instead of throwing outside the try/catch.
  const { type, payload, id } = e.data ?? {};
  try {
    switch (type) {
      case 'INIT': {
        init(payload || {});
        postMessage({ type: 'INIT_DONE', id });
        break;
      }
      case 'PUSH_AUDIO': {
        pushAudio(payload);
        // No response needed — fire and forget for continuous production
        break;
      }
      case 'GET_FEATURES': {
        const { startSample, endSample, normalize = true } = payload;
        const startFrame = sampleToFrame(startSample);
        const endFrame = sampleToFrame(endSample);
        const result = getFeatures(startFrame, endFrame, normalize);
        if (result) {
          // Transfer the features buffer for zero-copy.
          // NOTE(review): the `as any` cast presumably works around postMessage being
          // typed against the Window signature — confirm against the tsconfig libs.
          postMessage(
            { type: 'GET_FEATURES_DONE', payload: result, id },
            [result.features.buffer] as any
          );
        } else {
          postMessage({
            type: 'GET_FEATURES_DONE',
            payload: null,
            id
          });
        }
        break;
      }
      case 'GET_STATUS': {
        postMessage({
          type: 'GET_STATUS_DONE',
          payload: {
            totalSamples,
            computedFrames,
            bufferCapacityFrames: maxFrames,
            melBins: nMels,
          },
          id,
        });
        break;
      }
      case 'GET_LAST_MEL_FRAME': {
        const frame = getLastMelFrame();
        if (frame) {
          // The frame is a fresh copy, so its buffer can be transferred as well.
          postMessage(
            { type: 'GET_LAST_MEL_FRAME_DONE', payload: { melFrame: frame }, id },
            [frame.buffer] as any
          );
        } else {
          postMessage({ type: 'GET_LAST_MEL_FRAME_DONE', payload: null, id });
        }
        break;
      }
      case 'RESET': {
        reset();
        postMessage({ type: 'RESET_DONE', id });
        break;
      }
      default:
        console.warn('[MelWorker] Unknown message type:', type);
    }
  } catch (err: unknown) {
    console.error('[MelWorker] Error:', err);
    // Anything can be thrown in JS; only Error instances are guaranteed a `.message`.
    const message = err instanceof Error ? err.message : String(err);
    postMessage({ type: 'ERROR', payload: message, id });
  }
};
console.log('[MelWorker] Worker script loaded');