Spaces:

ysdede
/

keet-streaming

Running

App Files Files Community

keet-streaming / src /lib /vad /SileroVAD.ts

ysdede

feat(space): migrate Hugging Face Space to keet SolidJS app

b8cc2bf 2 months ago

raw

history blame contribute delete

7.4 kB

	import type { SileroVADConfig, SileroVADResult } from './types';

	// Ambient declaration for the onnxruntime-web namespace object (`ort`), which
	// is expected to be exposed as a global by parakeet.js / onnxruntime-web
	// rather than imported by this module. It is typed `any` because no typings
	// for it are imported here, and it may be missing at runtime, so readers of
	// this binding must check `typeof ort !== 'undefined'` before using it.
	declare const ort: any;

	/**
	 * Silero VAD ONNX model runner for browser-side voice activity detection.
	 *
	 * Loads the Silero VAD v5 ONNX model via onnxruntime-web and runs inference
	 * on fixed-size audio chunks (512 samples = 32ms at 16kHz, 256 samples at
	 * 8kHz), returning a speech probability per chunk.
	 *
	 * The model is recurrent: a single combined state tensor of shape [2, 1, 128]
	 * and a short tail of the previously fed audio (the "context") are carried
	 * from one call to the next, so chunks must be fed sequentially and calls to
	 * process() must not overlap.
	 *
	 * Reference: onnx-community/silero-vad on HuggingFace
	 * Python reference: onnx-asr/src/onnx_asr/models/silero.py
	 */
	export class SileroVAD {
		private config: SileroVADConfig;
		private session: any | null = null;
		private initialized = false;

		// onnxruntime-web namespace resolved once in init(); reused by process()
		// and reset() so they never dereference an unresolved global.
		private ortLib: any | null = null;

		// Recurrent model state (persisted across calls) and the constant
		// sample-rate input tensor.
		private stateH: any | null = null;
		private srTensor: any | null = null;

		// Silero model constants
		// For 16kHz: hop_size=512 (32ms), context_size=64
		// For 8kHz: hop_size=256 (32ms), context_size=32
		private readonly hopSize: number;
		private readonly contextSize: number;

		// Tail of the previously fed audio, prepended to each new chunk.
		private contextBuffer: Float32Array;

		/**
		 * @param config - Optional overrides. Fields that are missing or explicitly
		 *   `undefined` fall back to the defaults. When `negThreshold` is not given
		 *   it defaults to `threshold - 0.15` (0.35 for the default threshold).
		 * @throws RangeError if the resulting sample rate is neither 8000 nor 16000.
		 */
		constructor(config: Partial<SileroVADConfig> = {}) {
			// Drop undefined-valued keys so `{ threshold: undefined }` cannot
			// overwrite a default with undefined when spread below.
			const overrides = Object.fromEntries(
				Object.entries(config).filter(([, value]) => value !== undefined),
			) as Partial<SileroVADConfig>;

			this.config = {
				modelUrl: 'https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx',
				threshold: 0.5,
				negThreshold: overrides.threshold !== undefined ? overrides.threshold - 0.15 : 0.35,
				sampleRate: 16000,
				...overrides,
			};

			const { sampleRate } = this.config;
			if (sampleRate !== 16000 && sampleRate !== 8000) {
				// Any other rate would silently get the 8kHz chunk geometry while the
				// model is told a different rate; fail fast instead.
				throw new RangeError(
					`Unsupported sampleRate ${sampleRate}: Silero VAD supports only 8000 or 16000 Hz.`,
				);
			}

			const is16k = sampleRate === 16000;
			this.hopSize = is16k ? 512 : 256;
			this.contextSize = is16k ? 64 : 32;
			this.contextBuffer = new Float32Array(this.contextSize);
		}

		/**
		 * Initialize the ONNX session and create initial state tensors.
		 * Must be called before process(). A no-op when already initialized.
		 *
		 * @param modelUrl - Optional model location; falls back to `config.modelUrl`
		 *   when omitted or empty.
		 * @throws Error if the onnxruntime-web namespace (`ort`) cannot be found.
		 */
		async init(modelUrl?: string): Promise<void> {
			if (this.initialized) return;

			const url = modelUrl || this.config.modelUrl;

			// Ensure ort is available (exposed by parakeet.js backend.js)
			const ortLib = typeof ort !== 'undefined' ? ort : (globalThis as any).ort;
			if (!ortLib) {
				throw new Error(
					'onnxruntime-web (ort) not found. Ensure parakeet.js backend is initialized first.'
				);
			}

			// Create ONNX inference session with WASM backend (VAD is lightweight, no need for WebGPU)
			this.session = await ortLib.InferenceSession.create(url, {
				executionProviders: ['wasm'],
				graphOptimizationLevel: 'all',
			});

			this.ortLib = ortLib;
			this.stateH = SileroVAD.createZeroState(ortLib);
			// Sample rate is passed to the model as a one-element int64 tensor.
			this.srTensor = new ortLib.Tensor(
				'int64',
				BigInt64Array.from([BigInt(this.config.sampleRate)]),
				[1],
			);

			this.initialized = true;
		}

		/**
		 * Process a single audio chunk and return speech probability.
		 *
		 * The chunk should be exactly hopSize samples. A longer chunk is truncated
		 * to its first hopSize samples; a shorter one is zero-padded at the end.
		 *
		 * @param chunk - Float32Array of mono PCM samples at the configured sample rate
		 * @returns SileroVADResult with the speech probability, the thresholded
		 *   `isSpeech` flag and a wall-clock timestamp
		 * @throws Error if init() has not completed, or if the model does not return
		 *   the expected `output` / `stateN` tensors.
		 */
		async process(chunk: Float32Array): Promise<SileroVADResult> {
			if (!this.initialized || !this.session || !this.ortLib) {
				throw new Error('SileroVAD not initialized. Call init() first.');
			}

			// Only these samples reach the model; anything beyond hopSize is dropped.
			const fed = chunk.length > this.hopSize ? chunk.subarray(0, this.hopSize) : chunk;

			// Build the input: [context_size + hop_size] samples, context first.
			// A short chunk leaves the tail of the buffer zero-filled.
			const inputLength = this.contextSize + this.hopSize;
			const inputData = new Float32Array(inputLength);
			inputData.set(this.contextBuffer, 0);
			inputData.set(fed, this.contextSize);

			// Update the context from the samples that were actually fed, so a
			// truncated chunk cannot leak samples the model never saw.
			if (fed.length >= this.contextSize) {
				this.contextBuffer.set(fed.subarray(fed.length - this.contextSize));
			} else {
				// Shift existing context left and append what we have
				this.contextBuffer.copyWithin(0, fed.length);
				this.contextBuffer.set(fed, this.contextSize - fed.length);
			}

			const feeds: Record<string, any> = {
				input: new this.ortLib.Tensor('float32', inputData, [1, inputLength]),
				state: this.stateH,
				sr: this.srTensor,
			};

			const results = await this.session.run(feeds);

			const probability: unknown = results?.output?.data?.[0];
			const nextState = results?.stateN;
			if (typeof probability !== 'number' || !nextState) {
				// Without this guard a missing output would silently poison the
				// recurrent state and only fail on the following call.
				throw new Error(
					'Silero VAD model returned unexpected outputs (expected "output" and "stateN").',
				);
			}

			// Carry the recurrent state over to the next call.
			this.stateH = nextState;

			return {
				probability,
				isSpeech: probability >= this.config.threshold,
				timestamp: Date.now(),
			};
		}

		/**
		 * Process an audio buffer that may be larger than hopSize.
		 * Splits into hopSize chunks (the last one may be shorter) and returns one
		 * result per chunk, in order. An empty buffer yields an empty array.
		 */
		async processBuffer(audio: Float32Array): Promise<SileroVADResult[]> {
			const results: SileroVADResult[] = [];
			for (let offset = 0; offset < audio.length; offset += this.hopSize) {
				const end = Math.min(offset + this.hopSize, audio.length);
				// Sequential on purpose: each chunk depends on the previous state.
				results.push(await this.process(audio.subarray(offset, end)));
			}
			return results;
		}

		/**
		 * Reset the recurrent state and audio context. Call when starting a new
		 * audio stream. Does nothing before init().
		 */
		reset(): void {
			if (!this.initialized || !this.ortLib) return;

			this.stateH = SileroVAD.createZeroState(this.ortLib);
			this.contextBuffer.fill(0);
		}

		/**
		 * Release the ONNX session and free resources. The instance can be
		 * initialized again afterwards with init().
		 */
		async dispose(): Promise<void> {
			const session = this.session;

			// Mark the instance unusable before awaiting, so nothing runs against a
			// session that is being released, and clear the audio context so a later
			// init() does not start from the previous stream's samples.
			this.session = null;
			this.stateH = null;
			this.srTensor = null;
			this.ortLib = null;
			this.initialized = false;
			this.contextBuffer.fill(0);

			if (session) {
				await session.release();
			}
		}

		/**
		 * Whether the model is ready for inference.
		 */
		isReady(): boolean {
			return this.initialized;
		}

		/**
		 * Get the expected chunk size in samples.
		 */
		getHopSize(): number {
			return this.hopSize;
		}

		/**
		 * Get a shallow copy of the current configuration.
		 */
		getConfig(): SileroVADConfig {
			return { ...this.config };
		}

		/** Build a zero-filled state tensor: shape [2, 1, 128] for batch=1. */
		private static createZeroState(ortLib: any): any {
			return new ortLib.Tensor('float32', new Float32Array(2 * 1 * 128), [2, 1, 128]);
		}
	}