Spaces:

ysdede
/

keet-streaming

Running

App Files Files Community

keet-streaming / src /lib /audio /mel.worker.ts

ysdede

feat(space): migrate Hugging Face Space to keet SolidJS app

b8cc2bf about 1 month ago

raw

history blame contribute delete

17.5 kB

	/**
	* Keet - Continuous Mel Producer Worker
	*
	* Runs in a separate Web Worker thread, continuously computing raw log-mel
	* spectrogram frames as audio arrives. When the inference pipeline needs
	* features for a time window, it requests normalized features from this worker.
	*
	* Architecture:
	* AudioEngine → pushAudio(chunk) → mel.worker computes raw mel frames incrementally
	* Inference trigger → getFeatures(startFrame, endFrame) → mel.worker normalizes & returns
	*
	* This decouples mel computation from the inference thread entirely.
	* Features are always ready when the encoder needs them → zero wait for preprocessing.
	*
	* Imports from mel-math.ts (local module, no external deps).
	*/

	import {
	MEL_CONSTANTS,
	createMelFilterbank,
	createPaddedHannWindow,
	precomputeTwiddles,
	fft,
	normalizeMelFeatures,
	sampleToFrame,
	} from './mel-math';

	// Analysis constants shared with mel-math, pulled into module scope because
	// the per-frame loops in this worker read them on every iteration.
	const {
	  N_FFT, // FFT length; each analysis window spans N_FFT samples
	  HOP_LENGTH, // samples between the centers of consecutive frames
	  N_FREQ_BINS, // number of power-spectrum bins fed to the mel filterbank
	  PREEMPH, // pre-emphasis coefficient applied to the previous raw sample
	  LOG_ZERO_GUARD, // added to the mel energy before Math.log
	} = MEL_CONSTANTS;

	// ═══════════════════════════════════════════════════════════════════════════
	// Worker State
	// ═══════════════════════════════════════════════════════════════════════════

	/** Number of mel bins per frame; overwritten by init(). */
	let nMels = 128;

	// Pre-emphasized audio buffer (compacted after each pushAudio to stay bounded).
	// Only retains samples needed for the next mel frame's FFT window.
	let preemphBuffer = new Float32Array(0);
	let preemphBaseIdx = 0; // Global sample index corresponding to preemphBuffer[0]
	let preemphLen = 0; // Number of valid samples currently in preemphBuffer
	let lastRawSample = 0; // Last raw (not pre-emphasized) sample of the previous chunk
	let totalSamples = 0; // Total raw samples received since init/reset

	// Raw mel frame buffer: fixed-size circular, mel-major layout [nMels * maxFrames].
	// For mel bin m at frame t: rawMelBuffer[m * maxFrames + (t % maxFrames)].
	// Stays null until init() allocates it; pushAudio/getFeatures/getLastMelFrame
	// treat null as "not initialized".
	let rawMelBuffer: Float32Array | null = null;
	let maxFrames = 0; // Ring capacity in frames (set by init)
	let computedFrames = 0; // Monotonic: total frames computed (next frame index)
	let baseFrame = 0; // Oldest frame still available in the circular buffer

	// Pre-allocated FFT buffers (reused per frame); assigned by init().
	let fftRe: Float64Array;
	let fftIm: Float64Array;
	let powerBuf: Float32Array;

	// Pre-computed constants; assigned by init().
	let melFilterbank: Float32Array;
	let hannWindow: Float64Array;
	let twiddles: { cos: Float64Array; sin: Float64Array };

	// Logging throttle for getFeatures (avoid console spam)
	let lastGetFeaturesLogTime = 0;
	const GET_FEATURES_LOG_INTERVAL = 5000; // Log every 5 seconds max

	// ═══════════════════════════════════════════════════════════════════════════
	// Initialization
	// ═══════════════════════════════════════════════════════════════════════════

	/**
	 * (Re)initialize the worker: build the mel constants, allocate the FFT scratch
	 * buffers and the circular mel buffer, and zero every streaming counter.
	 *
	 * Calling it again discards all previously computed frames, because the mel
	 * buffer is re-allocated and `computedFrames` / `baseFrame` restart at 0.
	 *
	 * @param config - Optional settings. `nMels` is the number of mel bins; any
	 *   falsy value (missing, 0, NaN) falls back to 128.
	 */
	function init(config: { nMels?: number }) {
	  const t0 = performance.now();
	  // `||` rather than `??`: 0 and NaN are not usable bin counts, so they also
	  // take the default.
	  nMels = config.nMels || 128;

	  // Build mel computation constants
	  melFilterbank = createMelFilterbank(nMels);
	  hannWindow = createPaddedHannWindow();
	  twiddles = precomputeTwiddles(N_FFT);

	  // Allocate FFT buffers
	  fftRe = new Float64Array(N_FFT);
	  fftIm = new Float64Array(N_FFT);
	  powerBuf = new Float32Array(N_FREQ_BINS);

	  // Fixed-size circular mel buffer for ~120 seconds (12000 frames at 100 fps).
	  // Old frames are silently overwritten; no reallocation ever occurs.
	  maxFrames = 12000;
	  rawMelBuffer = new Float32Array(nMels * maxFrames);
	  computedFrames = 0;
	  baseFrame = 0;

	  // Pre-emphasized audio buffer: only needs to hold the FFT overlap window
	  // plus one incoming chunk. Compacted after each pushAudio call so it
	  // stays bounded to roughly N_FFT + chunk_size samples.
	  preemphBuffer = new Float32Array(N_FFT + 16000); // N_FFT overlap + up to 1s chunk
	  preemphBaseIdx = 0;
	  preemphLen = 0;
	  lastRawSample = 0;
	  totalSamples = 0;

	  // 4 bytes per Float32 element, reported in MiB for the startup log.
	  const melBufMB = (nMels * maxFrames * 4 / 1024 / 1024).toFixed(1);
	  console.log(`[MelWorker] Initialized: nMels=${nMels}, maxFrames=${maxFrames} (circular), ${melBufMB}MB mel buffer, preemph=${preemphBuffer.length} samples, init ${(performance.now() - t0).toFixed(1)} ms`);
	}

	// ═══════════════════════════════════════════════════════════════════════════
	// Incremental Mel Computation
	// ═══════════════════════════════════════════════════════════════════════════

	/**
	 * Append a chunk of raw audio and compute every mel frame that became
	 * available, writing each one into the circular mel buffer.
	 *
	 * Does nothing before init() has allocated the mel buffer, or for an empty chunk.
	 * Window positions that fall outside the retained pre-emphasized samples
	 * (before the stream start, or past the newest sample) are treated as zeros.
	 *
	 * @param chunk - Raw audio samples to append to the stream.
	 */
	function pushAudio(chunk: Float32Array) {
	  const melRing = rawMelBuffer;
	  if (!melRing) return;

	  const incoming = chunk.length;
	  if (incoming === 0) return;

	  const startedAt = performance.now();

	  // Step 1: make room for the chunk. Compaction normally keeps the buffer
	  // small, so growing is only a fallback for unusually large chunks.
	  const filledAfter = preemphLen + incoming;
	  if (filledAfter > preemphBuffer.length) {
	    const grown = new Float32Array(Math.max(preemphBuffer.length * 2, filledAfter));
	    grown.set(preemphBuffer.subarray(0, preemphLen));
	    preemphBuffer = grown;
	  }

	  // Step 2: pre-emphasis, y[n] = x[n] - PREEMPH * x[n-1]. The previous raw
	  // sample is carried across chunk boundaries through lastRawSample.
	  let previous = lastRawSample;
	  for (let i = 0; i < incoming; i++) {
	    const current = chunk[i];
	    preemphBuffer[preemphLen + i] = current - PREEMPH * previous;
	    previous = current;
	  }
	  preemphLen = filledAfter;
	  lastRawSample = previous;
	  totalSamples += incoming;

	  // Step 3: how many frames does the stream hold now?
	  const frameCountNow = Math.floor(totalSamples / HOP_LENGTH);
	  if (frameCountNow <= computedFrames) {
	    compactPreemphBuffer();
	    return;
	  }

	  // Step 4: compute each new frame. Frame t is centered on sample
	  // t * HOP_LENGTH, so its window starts half an FFT length earlier.
	  const halfWindow = N_FFT >> 1;
	  const firstNewFrame = computedFrames;

	  for (let frame = firstNewFrame; frame < frameCountNow; frame++) {
	    // Window start expressed as an index into the compacted preemph buffer.
	    const windowStartLocal = frame * HOP_LENGTH - halfWindow - preemphBaseIdx;
	    const slot = frame % maxFrames; // ring position: old frames get overwritten

	    // a) Apply the window; out-of-range positions contribute zero.
	    for (let k = 0; k < N_FFT; k++) {
	      const idx = windowStartLocal + k;
	      const inRange = idx >= 0 && idx < preemphLen;
	      fftRe[k] = (inRange ? preemphBuffer[idx] : 0) * hannWindow[k];
	      fftIm[k] = 0;
	    }

	    // b) In-place FFT over the windowed frame.
	    fft(fftRe, fftIm, N_FFT, twiddles);

	    // c) Power spectrum: |X[k]|^2 for the first N_FREQ_BINS bins.
	    for (let k = 0; k < N_FREQ_BINS; k++) {
	      const re = fftRe[k];
	      const im = fftIm[k];
	      powerBuf[k] = re * re + im * im;
	    }

	    // d) Mel filterbank projection + log, stored mel-major in the ring.
	    for (let m = 0; m < nMels; m++) {
	      const filterRow = m * N_FREQ_BINS;
	      let energy = 0;
	      for (let k = 0; k < N_FREQ_BINS; k++) {
	        energy += powerBuf[k] * melFilterbank[filterRow + k];
	      }
	      melRing[m * maxFrames + slot] = Math.log(energy + LOG_ZERO_GUARD);
	    }
	  }

	  const addedFrames = frameCountNow - firstNewFrame;
	  computedFrames = frameCountNow;

	  // Once more than maxFrames frames exist, the oldest ones have been overwritten.
	  if (computedFrames - baseFrame > maxFrames) {
	    baseFrame = computedFrames - maxFrames;
	  }

	  // Step 5: drop pre-emphasized samples that no future frame will read.
	  compactPreemphBuffer();

	  if (addedFrames > 0) {
	    const elapsed = performance.now() - startedAt;
	    // Throttle: log only when this call carried the running frame count
	    // across a multiple of 50.
	    if (computedFrames % 50 < addedFrames) {
	      console.log(`[MelWorker] pushAudio: +${incoming} samples, +${addedFrames} frames, total ${computedFrames} frames (${(totalSamples / 16000).toFixed(1)}s), buf [${baseFrame}..${computedFrames}), preemph ${preemphLen} samples, ${elapsed.toFixed(1)} ms`);
	    }
	  }
	}

	/**
	 * Drop pre-emphasized samples that no upcoming frame can read.
	 *
	 * The next frame to compute has index `computedFrames`; its FFT window begins
	 * at global sample `computedFrames * HOP_LENGTH - N_FFT / 2` (clamped to 0).
	 * Samples before that point are moved out by sliding the survivors to the
	 * front of the buffer and advancing `preemphBaseIdx` to match.
	 */
	function compactPreemphBuffer() {
	  const keepFromGlobal = Math.max(0, computedFrames * HOP_LENGTH - (N_FFT >> 1));
	  const staleCount = keepFromGlobal - preemphBaseIdx;

	  // Everything currently held is stale: empty the buffer and jump the base.
	  if (staleCount >= preemphLen) {
	    preemphLen = 0;
	    preemphBaseIdx = keepFromGlobal;
	    return;
	  }

	  // Some leading samples are stale: slide the rest down to index 0.
	  if (staleCount > 0) {
	    preemphBuffer.copyWithin(0, staleCount, preemphLen);
	    preemphLen -= staleCount;
	    preemphBaseIdx = keepFromGlobal;
	  }
	  // Otherwise nothing can be discarded yet.
	}

	// ═══════════════════════════════════════════════════════════════════════════
	// Feature Extraction (normalize a requested range)
	// ═══════════════════════════════════════════════════════════════════════════

	/**
	 * Extract mel features for a frame range.
	 *
	 * The requested range is clamped to the frames still held in the circular
	 * buffer, [baseFrame, computedFrames), and truncated to whole frame indices.
	 * Returns null (with a console warning) when the worker holds no frames or
	 * when the clamped range is empty. A NaN bound counts as an empty range.
	 *
	 * @param startFrame - Start frame index
	 * @param endFrame - End frame index (exclusive)
	 * @param normalize - If true (default), apply per-feature mean/variance normalization
	 * (required for ASR). If false, return raw log-mel values (for visualization with
	 * fixed dB scaling to avoid "gain hunting" during silence).
	 * @returns `features` in mel-major layout [melBins, T], the frame count `T`
	 * and `melBins`, or null if nothing could be extracted.
	 *
	 * PERFORMANCE NOTE (2026-02-09): When normalize=false, the caller (e.g. visualizer)
	 * still incurs the cost of extracting frames from the circular buffer. If visualization
	 * performance becomes an issue, consider:
	 * 1. Reducing visualizer update frequency
	 * 2. Caching/reusing extracted frames between draws
	 * 3. Downsampling the spectrogram (skip frames for display)
	 */
	function getFeatures(startFrame: number, endFrame: number, normalize: boolean = true): {
	  features: Float32Array;
	  T: number;
	  melBins: number;
	} | null {
	  const t0 = performance.now();

	  if (!rawMelBuffer || computedFrames === 0) {
	    console.warn(`[MelWorker] getFeatures: no data (computedFrames=${computedFrames})`);
	    return null;
	  }
	  const ring = rawMelBuffer;

	  // Clamp to available circular range [baseFrame, computedFrames). Truncating
	  // keeps a fractional bound from indexing between ring slots.
	  const sf = Math.trunc(Math.max(baseFrame, startFrame));
	  const ef = Math.trunc(Math.min(computedFrames, endFrame));
	  const T = ef - sf;

	  // `!(T > 0)` instead of `T <= 0` so a NaN bound is rejected as well.
	  if (!(T > 0)) {
	    console.warn(`[MelWorker] getFeatures: empty range (requested ${startFrame}..${endFrame}, available ${baseFrame}..${computedFrames})`);
	    return null;
	  }

	  // Extract the requested window from circular buffer (mel-major layout [nMels, T]).
	  // pushAudio keeps computedFrames - baseFrame <= maxFrames, so the window wraps
	  // around the ring at most once: copy the run up to the end of the row, then
	  // the remainder from the start of the row.
	  const raw = new Float32Array(nMels * T);
	  const firstSlot = sf % maxFrames;
	  const headLen = Math.min(T, maxFrames - firstSlot);
	  const tailLen = T - headLen;
	  for (let m = 0; m < nMels; m++) {
	    const srcRowBase = m * maxFrames;
	    const dstBase = m * T;
	    raw.set(ring.subarray(srcRowBase + firstSlot, srcRowBase + firstSlot + headLen), dstBase);
	    if (tailLen > 0) {
	      raw.set(ring.subarray(srcRowBase, srcRowBase + tailLen), dstBase + headLen);
	    }
	  }

	  // Optionally normalize (ASR requires normalized; visualizer uses raw for fixed dB scale)
	  const features = normalize ? normalizeMelFeatures(raw, nMels, T) : raw;

	  // Throttled logging to avoid console spam (was causing noticeable CPU overhead)
	  const now = performance.now();
	  if (now - lastGetFeaturesLogTime > GET_FEATURES_LOG_INTERVAL) {
	    lastGetFeaturesLogTime = now;
	    const elapsed = now - t0;
	    console.log(`[MelWorker] getFeatures: frames ${sf}..${ef} (${T} frames, ${(T * HOP_LENGTH / 16000).toFixed(2)}s), normalize=${normalize}, ${elapsed.toFixed(1)} ms, buf [${baseFrame}..${computedFrames})`);
	  }

	  return { features, T, melBins: nMels };
	}

	// ═══════════════════════════════════════════════════════════════════════════
	// Last mel frame (for equalizer-style bar display)
	// ═══════════════════════════════════════════════════════════════════════════

	/**
	 * Return the last computed mel frame (raw log-mel, one value per bin).
	 * Averages the last 2 frames for smoother display; with only one frame
	 * computed, that frame is averaged with itself.
	 *
	 * @returns A new Float32Array of length nMels, or null if the worker is not
	 * initialized or no frame has been computed yet.
	 */
	function getLastMelFrame(): Float32Array | null {
	  if (!rawMelBuffer || computedFrames === 0) return null;
	  const ring = rawMelBuffer;
	  const out = new Float32Array(nMels);
	  // Ring slots of the newest frame and the one before it.
	  const lastIdx = (computedFrames - 1 + maxFrames) % maxFrames;
	  const prevIdx = computedFrames >= 2 ? (computedFrames - 2 + maxFrames) % maxFrames : lastIdx;
	  for (let m = 0; m < nMels; m++) {
	    const base = m * maxFrames; // start of mel bin m's row (mel-major layout)
	    out[m] = 0.5 * (ring[base + lastIdx] + ring[base + prevIdx]);
	  }
	  return out;
	}

	// ═══════════════════════════════════════════════════════════════════════════
	// Reset
	// ═══════════════════════════════════════════════════════════════════════════

	/**
	 * Rewind the stream to its start without re-allocating anything.
	 *
	 * Only the counters are cleared; the buffers keep their contents, which
	 * become unreachable because `computedFrames` and `preemphLen` are 0 again.
	 */
	function reset() {
	  // Frame bookkeeping for the circular mel buffer.
	  baseFrame = 0;
	  computedFrames = 0;

	  // Raw-audio bookkeeping, including the pre-emphasis carry-over sample.
	  totalSamples = 0;
	  lastRawSample = 0;

	  // Pre-emphasized sample buffer: empty, anchored at global sample 0.
	  preemphBaseIdx = 0;
	  preemphLen = 0;

	  console.log('[MelWorker] Reset');
	}

	// ═══════════════════════════════════════════════════════════════════════════
	// Message Handler
	// ═══════════════════════════════════════════════════════════════════════════

	/**
	 * Turn any thrown value into a string for the ERROR reply. A thrown value is
	 * not guaranteed to be an Error, so `.message` is only read when it exists.
	 */
	function describeWorkerError(err: unknown): string {
	  if (err instanceof Error) return err.message;
	  if (typeof err === 'object' && err !== null && 'message' in err) {
	    return String((err as { message: unknown }).message);
	  }
	  return String(err);
	}

	/**
	 * Dispatch one request from the owning thread.
	 *
	 * Messages have the shape `{ type, payload, id }`; `id` is echoed back on
	 * every reply. PUSH_AUDIO is fire-and-forget and sends no reply. An unknown
	 * `type` is only logged. Any exception raised while handling a message is
	 * reported back as `{ type: 'ERROR', payload: <message>, id }`.
	 */
	self.onmessage = (e: MessageEvent) => {
	  const { type, payload, id } = e.data;

	  try {
	    switch (type) {
	      case 'INIT': {
	        init(payload || {});
	        postMessage({ type: 'INIT_DONE', id });
	        break;
	      }

	      case 'PUSH_AUDIO': {
	        pushAudio(payload);
	        // No response needed — fire and forget for continuous production
	        break;
	      }

	      case 'GET_FEATURES': {
	        // The request is expressed in samples; convert to frame indices.
	        const { startSample, endSample, normalize = true } = payload;
	        const startFrame = sampleToFrame(startSample);
	        const endFrame = sampleToFrame(endSample);
	        const result = getFeatures(startFrame, endFrame, normalize);

	        if (result) {
	          // Transfer the features buffer for zero-copy
	          postMessage(
	            { type: 'GET_FEATURES_DONE', payload: result, id },
	            [result.features.buffer] as any
	          );
	        } else {
	          postMessage({
	            type: 'GET_FEATURES_DONE',
	            payload: null,
	            id
	          });
	        }
	        break;
	      }

	      case 'GET_STATUS': {
	        postMessage({
	          type: 'GET_STATUS_DONE',
	          payload: {
	            totalSamples,
	            computedFrames,
	            bufferCapacityFrames: maxFrames,
	            melBins: nMels,
	          },
	          id,
	        });
	        break;
	      }

	      case 'GET_LAST_MEL_FRAME': {
	        const frame = getLastMelFrame();
	        if (frame) {
	          // The frame is freshly allocated per call, so it can be transferred.
	          postMessage(
	            { type: 'GET_LAST_MEL_FRAME_DONE', payload: { melFrame: frame }, id },
	            [frame.buffer] as any
	          );
	        } else {
	          postMessage({ type: 'GET_LAST_MEL_FRAME_DONE', payload: null, id });
	        }
	        break;
	      }

	      case 'RESET': {
	        reset();
	        postMessage({ type: 'RESET_DONE', id });
	        break;
	      }

	      default:
	        console.warn('[MelWorker] Unknown message type:', type);
	    }
	  } catch (err: unknown) {
	    console.error('[MelWorker] Error:', err);
	    postMessage({ type: 'ERROR', payload: describeWorkerError(err), id });
	  }
	};

	console.log('[MelWorker] Worker script loaded');