// keet-streaming/src/lib/audio/AudioEngine.ts
import { AudioEngine as IAudioEngine, AudioEngineConfig, AudioSegment, IRingBuffer, AudioMetrics } from './types';
import { RingBuffer } from './RingBuffer';
import { AudioSegmentProcessor } from './AudioSegmentProcessor';
import { resampleLinear } from './utils';
/** Duration of the visualization buffer in seconds */
const VISUALIZATION_BUFFER_DURATION = 30;
/**
* AudioEngine implementation for capturing audio, buffering it, and performing VAD.
* Uses AudioSegmentProcessor for robust speech detection (incl. lookback).
*/
export class AudioEngine implements IAudioEngine {
private config: AudioEngineConfig;
private ringBuffer: IRingBuffer;
private audioProcessor: AudioSegmentProcessor; // Replaces EnergyVAD
private deviceId: string | null = null;
private audioContext: AudioContext | null = null;
private mediaStream: MediaStream | null = null;
private workletNode: AudioWorkletNode | null = null;
private sourceNode: MediaStreamAudioSourceNode | null = null;
// AnalyserNode for oscilloscope waveform (native getByteTimeDomainData)
private analyserNode: AnalyserNode | null = null;
private analyserSourceNode: MediaStreamAudioSourceNode | null = null;
private analyserGainNode: GainNode | null = null;
private analyserTimeBuffer: Uint8Array | null = null;
private waveformOut: Float32Array | null = null;
private readonly ANALYSER_FFT_SIZE = 256;
private readonly ANALYSER_SMOOTHING = 0.3; // Low = fast oscilloscope response
// Track device vs target sample rates
private deviceSampleRate: number = 48000;
private targetSampleRate: number = 16000;
private currentEnergy: number = 0;
private segmentCallbacks: Array<(segment: AudioSegment) => void> = [];
// Fixed-window streaming state (v3 token streaming mode)
private windowCallbacks: Array<{
windowDuration: number;
overlapDuration: number;
triggerInterval: number;
callback: (audio: Float32Array, startTime: number) => void;
lastWindowEnd: number; // Frame offset of last window end
}> = [];
// Resampled audio chunk callbacks (for mel worker, etc.)
private audioChunkCallbacks: Array<(chunk: Float32Array) => void> = [];
// SMA buffer for energy calculation
private energyHistory: number[] = [];
// Last N energy values for bar visualizer (oldest first when read)
private energyBarHistory: number[] = [];
private readonly BAR_LEVELS_SIZE = 64;
// Visualization Summary Buffer (Low-Res Min/Max pairs)
private visualizationSummary: Float32Array | null = null;
private visualizationSummaryPosition: number = 0;
private readonly VIS_SUMMARY_SIZE = 2000; // 2000 min/max pairs for 30 seconds = 15ms resolution
// Raw visualization buffer (still kept for higher-res requests if needed, but summary is preferred)
private visualizationBuffer: Float32Array | null = null;
private visualizationBufferPosition: number = 0;
private visualizationBufferSize: number = 0;
// Metrics for UI components
private metrics: AudioMetrics = {
currentEnergy: 0,
averageEnergy: 0,
peakEnergy: 0,
noiseFloor: 0.01,
currentSNR: 0,
isSpeaking: false,
};
// Subscribers for visualization updates
private visualizationCallbacks: Array<(data: Float32Array, metrics: AudioMetrics, bufferEndTime: number) => void> = [];
private lastVisualizationNotifyTime: number = 0;
private readonly VISUALIZATION_NOTIFY_INTERVAL_MS = 16; // ~60fps for responsive oscilloscope
// Recent segments for visualization (stores timing info only)
private recentSegments: Array<{ startTime: number; endTime: number; isProcessed: boolean }> = [];
private readonly MAX_SEGMENTS_FOR_VISUALIZATION = 50;
constructor(config: Partial<AudioEngineConfig> = {}) {
this.config = {
sampleRate: 16000,
bufferDuration: 120,
energyThreshold: 0.08, // Match legacy UI project 'medium'
minSpeechDuration: 240, // Match legacy UI project
minSilenceDuration: 400, // Match legacy UI project
maxSegmentDuration: 4.8, // Match legacy UI project
// Advanced VAD defaults
lookbackDuration: 0.120,
speechHangover: 0.16,
minEnergyIntegral: 22,
minEnergyPerSecond: 5,
useAdaptiveEnergyThresholds: true,
adaptiveEnergyIntegralFactor: 25.0,
adaptiveEnergyPerSecondFactor: 10.0,
minAdaptiveEnergyIntegral: 3,
minAdaptiveEnergyPerSecond: 1,
maxSilenceWithinSpeech: 0.160,
endingSpeechTolerance: 0.240,
...config,
};
this.deviceId = this.config.deviceId || null;
this.targetSampleRate = this.config.sampleRate;
// RingBuffer operates at TARGET sample rate (16kHz)
this.ringBuffer = new RingBuffer(this.targetSampleRate, this.config.bufferDuration);
// Initialize AudioSegmentProcessor
this.audioProcessor = new AudioSegmentProcessor({
sampleRate: this.targetSampleRate,
energyThreshold: this.config.energyThreshold,
minSpeechDuration: this.config.minSpeechDuration,
silenceThreshold: this.config.minSilenceDuration,
maxSegmentDuration: this.config.maxSegmentDuration,
lookbackDuration: this.config.lookbackDuration,
maxSilenceWithinSpeech: this.config.maxSilenceWithinSpeech,
endingSpeechTolerance: this.config.endingSpeechTolerance,
snrThreshold: 3.0,
minSnrThreshold: 1.0,
noiseFloorAdaptationRate: 0.05,
fastAdaptationRate: 0.15,
minBackgroundDuration: 1.0,
energyRiseThreshold: 0.08
});
// Initialize visualization buffer (30 seconds at target sample rate)
this.visualizationBufferSize = Math.round(this.targetSampleRate * VISUALIZATION_BUFFER_DURATION);
this.visualizationBuffer = new Float32Array(this.visualizationBufferSize);
this.visualizationBufferPosition = 0;
// Initialize visualization summary (2000 points for 30s)
this.visualizationSummary = new Float32Array(this.VIS_SUMMARY_SIZE * 2);
this.visualizationSummaryPosition = 0;
console.log('[AudioEngine] Initialized with config:', this.config);
}
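// Usage sketch (illustrative, not part of this module; config values are
// examples, not recommendations):
//
//   const engine = new AudioEngine({ sampleRate: 16000, energyThreshold: 0.05 });
//   await engine.start();  // lazily init()s the graph, then resumes the context
//   // ... subscribe to segments / chunks / visualization here ...
//   engine.stop();         // suspends the AudioContext; buffers are kept
//   engine.reset();        // clears buffers + VAD state for a fresh session
//   engine.dispose();      // releases the mic and closes the AudioContext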
private isWorkletInitialized = false;
async init(): Promise<void> {
// Request microphone permission with optional deviceId
try {
if (this.mediaStream) {
this.mediaStream.getTracks().forEach(t => t.stop());
}
const constraints: MediaStreamConstraints = {
audio: {
deviceId: this.deviceId ? { exact: this.deviceId } : undefined,
channelCount: 1,
echoCancellation: false,
noiseSuppression: false,
autoGainControl: false,
},
};
console.log('[AudioEngine] Requesting microphone:', constraints);
this.mediaStream = await navigator.mediaDevices.getUserMedia(constraints);
console.log('[AudioEngine] Microphone stream acquired:', this.mediaStream.id);
} catch (err) {
console.error('[AudioEngine] Failed to get media stream:', err);
throw err;
}
const track = this.mediaStream!.getAudioTracks()[0];
const trackSettings = track?.getSettings?.();
// Device sample rate (what the mic gives us)
this.deviceSampleRate = trackSettings?.sampleRate ?? 48000;
console.log('[AudioEngine] Device sample rate:', this.deviceSampleRate, '-> Target:', this.targetSampleRate);
if (this.audioContext && this.audioContext.sampleRate !== this.deviceSampleRate) {
await this.audioContext.close();
this.audioContext = null;
}
if (!this.audioContext) {
this.audioContext = new AudioContext({
sampleRate: this.deviceSampleRate,
latencyHint: 'interactive',
});
console.log('[AudioEngine] Created AudioContext:', this.audioContext.state, 'sampleRate:', this.audioContext.sampleRate);
}
// Re-initialize components with correct rates
this.ringBuffer = new RingBuffer(this.targetSampleRate, this.config.bufferDuration);
// Re-create the processor with the same options as the constructor,
// so the advanced VAD settings are not silently dropped on re-init
this.audioProcessor = new AudioSegmentProcessor({
sampleRate: this.targetSampleRate,
energyThreshold: this.config.energyThreshold,
minSpeechDuration: this.config.minSpeechDuration,
silenceThreshold: this.config.minSilenceDuration,
maxSegmentDuration: this.config.maxSegmentDuration,
lookbackDuration: this.config.lookbackDuration,
maxSilenceWithinSpeech: this.config.maxSilenceWithinSpeech,
endingSpeechTolerance: this.config.endingSpeechTolerance,
snrThreshold: 3.0, minSnrThreshold: 1.0,
noiseFloorAdaptationRate: 0.05, fastAdaptationRate: 0.15,
minBackgroundDuration: 1.0, energyRiseThreshold: 0.08
});
if (!this.isWorkletInitialized) {
const windowDuration = 0.080;
const processorCode = `
class CaptureProcessor extends AudioWorkletProcessor {
constructor(options) {
super(options);
const opts = options?.processorOptions || {};
this.inputSampleRate = opts.inputSampleRate || 16000;
this.targetSampleRate = opts.targetSampleRate || this.inputSampleRate;
this.ratio = this.inputSampleRate / this.targetSampleRate;
this.bufferSize = Math.round(${windowDuration} * this.inputSampleRate);
this.buffer = new Float32Array(this.bufferSize);
this.index = 0;
this._lastLog = 0;
}
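// Copy (or resample) the filled buffer and post it to the main thread,
// transferring ownership of the output buffer to avoid a copy.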
_emitChunk() {
let out;
let maxAbs = 0;
if (this.targetSampleRate === this.inputSampleRate) {
out = new Float32Array(this.bufferSize);
for (let i = 0; i < this.bufferSize; i++) {
const v = this.buffer[i];
out[i] = v;
const a = v < 0 ? -v : v;
if (a > maxAbs) maxAbs = a;
}
} else {
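// Linear-interpolation resample from inputSampleRate down to targetSampleRate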
const outLength = Math.floor(this.bufferSize / this.ratio);
out = new Float32Array(outLength);
for (let i = 0; i < outLength; i++) {
const srcIndex = i * this.ratio;
const srcIndexFloor = Math.floor(srcIndex);
const srcIndexCeil = Math.min(srcIndexFloor + 1, this.bufferSize - 1);
const t = srcIndex - srcIndexFloor;
const v = this.buffer[srcIndexFloor] * (1 - t) + this.buffer[srcIndexCeil] * t;
out[i] = v;
const a = v < 0 ? -v : v;
if (a > maxAbs) maxAbs = a;
}
}
this.port.postMessage(
{ type: 'audio', samples: out, sampleRate: this.targetSampleRate, maxAbs },
[out.buffer]
);
}
process(inputs) {
const input = inputs[0];
if (!input || !input[0]) return true;
const channelData = input[0];
// Buffer the data
for (let i = 0; i < channelData.length; i++) {
this.buffer[this.index++] = channelData[i];
if (this.index >= this.bufferSize) {
this._emitChunk();
this.index = 0;
// Debug log every ~5 seconds
const now = Date.now();
if (now - this._lastLog > 5000) {
this.port.postMessage({ type: 'log', message: '[AudioWorklet] Active' });
this._lastLog = now;
}
}
}
return true;
}
}
registerProcessor('capture-processor', CaptureProcessor);
`;
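// Load the inline processor source as a module via a Blob URL, so no separate
// worklet asset has to be served or wired into the bundler.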
const blob = new Blob([processorCode], { type: 'application/javascript' });
const url = URL.createObjectURL(blob);
try {
await this.audioContext.audioWorklet.addModule(url);
this.isWorkletInitialized = true;
console.log('[AudioEngine] AudioWorklet module loaded');
} catch (err) {
console.error('[AudioEngine] Failed to load worklet:', err);
if (err instanceof Error && err.name === 'InvalidStateError') {
// Ignore if already registered
this.isWorkletInitialized = true;
}
}
}
// Re-create the worklet node; disconnect any previous instance first
if (this.workletNode) this.workletNode.disconnect();
this.workletNode = new AudioWorkletNode(this.audioContext, 'capture-processor', {
processorOptions: { inputSampleRate: this.deviceSampleRate, targetSampleRate: this.targetSampleRate },
});
this.workletNode.port.onmessage = (event: MessageEvent<any>) => {
if (event.data?.type === 'audio' && event.data.samples instanceof Float32Array) {
this.handleAudioChunk(event.data.samples, event.data.maxAbs, event.data.sampleRate);
} else if (event.data instanceof Float32Array) {
this.handleAudioChunk(event.data, undefined, this.deviceSampleRate);
} else if (event.data?.type === 'log') {
console.log(event.data.message);
}
};
this.workletNode.onprocessorerror = (e) => {
console.error('[AudioEngine] Worklet processor error:', e);
};
// Reconnect source node
this.sourceNode?.disconnect();
this.sourceNode = this.audioContext.createMediaStreamSource(this.mediaStream);
this.sourceNode.connect(this.workletNode);
// AnalyserNode branch for lightweight preview bars (native FFT, no mel worker)
this.disposeAnalyser();
this.analyserSourceNode = this.audioContext.createMediaStreamSource(this.mediaStream);
this.analyserNode = this.audioContext.createAnalyser();
this.analyserNode.fftSize = this.ANALYSER_FFT_SIZE;
this.analyserNode.smoothingTimeConstant = this.ANALYSER_SMOOTHING;
this.analyserTimeBuffer = new Uint8Array(this.analyserNode.fftSize);
this.waveformOut = new Float32Array(this.analyserNode.fftSize);
this.analyserGainNode = this.audioContext.createGain();
this.analyserGainNode.gain.value = 0;
this.analyserSourceNode.connect(this.analyserNode);
this.analyserNode.connect(this.analyserGainNode);
this.analyserGainNode.connect(this.audioContext.destination);
// Keep graph alive
this.workletNode.connect(this.audioContext.destination);
console.log('[AudioEngine] Graph connected: Source -> Worklet, AnalyserNode for oscilloscope');
}
async start(): Promise<void> {
if (!this.mediaStream || !this.audioContext || !this.workletNode) {
await this.init();
}
if (this.audioContext?.state === 'suspended') {
await this.audioContext.resume();
}
}
stop(): void {
if (this.audioContext?.state === 'running') {
this.audioContext.suspend();
}
}
/**
* Reset buffers and VAD state for a new session while keeping the audio graph.
* Aligns visualization + segment timebase to 0, matching legacy UI project behavior.
*/
reset(): void {
// Reset audio/VAD state
this.ringBuffer.reset();
this.audioProcessor.reset();
this.currentEnergy = 0;
// Reset metrics
this.metrics = {
currentEnergy: 0,
averageEnergy: 0,
peakEnergy: 0,
noiseFloor: 0.01,
currentSNR: 0,
isSpeaking: false,
};
// Clear segment history used by the visualizer
this.recentSegments = [];
this.energyBarHistory = [];
// Reset visualization buffer
if (this.visualizationBuffer) {
this.visualizationBuffer.fill(0);
}
this.visualizationBufferPosition = 0;
// Reset windowed streaming cursors
for (const entry of this.windowCallbacks) {
entry.lastWindowEnd = 0;
}
// Push a blank update so UI clears stale waveform/segments
this.notifyVisualizationUpdate();
}
getCurrentEnergy(): number {
return this.currentEnergy;
}
/** Oscilloscope waveform via AnalyserNode.getByteTimeDomainData (native, fast), values -1..1. Falls back to 0..1 energy-bar history when the analyser is unavailable. */
getBarLevels(): Float32Array {
if (this.analyserNode && this.analyserTimeBuffer && this.waveformOut) {
(this.analyserNode as { getByteTimeDomainData(array: Uint8Array): void }).getByteTimeDomainData(this.analyserTimeBuffer);
for (let i = 0; i < this.analyserTimeBuffer.length; i++) {
this.waveformOut[i] = (this.analyserTimeBuffer[i] - 128) / 128; // 0..255 -> -1..1
}
return this.waveformOut;
}
const out = new Float32Array(this.BAR_LEVELS_SIZE);
const h = this.energyBarHistory;
const start = h.length <= this.BAR_LEVELS_SIZE ? 0 : h.length - this.BAR_LEVELS_SIZE;
for (let i = 0; i < this.BAR_LEVELS_SIZE; i++) {
const idx = start + i;
out[i] = idx < h.length ? Math.min(1, Math.max(0, h[idx])) : 0;
}
return out;
}
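// Rendering sketch (illustrative; the canvas and rAF loop belong to the
// consuming component, not to this class):
//
//   const ctx = canvas.getContext('2d')!;
//   const draw = () => {
//     const wave = engine.getBarLevels(); // time-domain samples in -1..1
//     ctx.clearRect(0, 0, canvas.width, canvas.height);
//     ctx.beginPath();
//     for (let i = 0; i < wave.length; i++) {
//       const x = (i / (wave.length - 1)) * canvas.width;
//       const y = (0.5 - wave[i] * 0.5) * canvas.height;
//       i === 0 ? ctx.moveTo(x, y) : ctx.lineTo(x, y);
//     }
//     ctx.stroke();
//     requestAnimationFrame(draw);
//   };
//   draw();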
getSignalMetrics(): { noiseFloor: number; snr: number; threshold: number; snrThreshold: number } {
const stats = this.audioProcessor.getStats();
return {
noiseFloor: stats.noiseFloor ?? 0.0001,
snr: stats.snr ?? 0,
threshold: this.config.energyThreshold,
snrThreshold: stats.snrThreshold ?? 3.0
};
}
isSpeechActive(): boolean {
return this.audioProcessor.getStateInfo().inSpeech;
}
getRingBuffer(): IRingBuffer {
return this.ringBuffer;
}
onSpeechSegment(callback: (segment: AudioSegment) => void): () => void {
this.segmentCallbacks.push(callback);
return () => {
this.segmentCallbacks = this.segmentCallbacks.filter((cb) => cb !== callback);
};
}
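// Subscription sketch (illustrative): a segment carries frame offsets into
// the ring buffer, so its audio can be pulled out on demand:
//
//   const unsubscribe = engine.onSpeechSegment((seg) => {
//     const audio = engine.getRingBuffer().read(seg.startFrame, seg.endFrame);
//     console.log(`speech: ${seg.duration.toFixed(2)}s (${audio.length} samples)`);
//   });
//   // later: unsubscribe();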
/**
* Subscribe to fixed-window chunks for token streaming mode.
* Fires every triggerInterval seconds with windowDuration of audio.
*/
onWindowChunk(
windowDuration: number,
overlapDuration: number,
triggerInterval: number,
callback: (audio: Float32Array, startTime: number) => void
): () => void {
const entry = {
windowDuration,
overlapDuration,
triggerInterval,
callback,
lastWindowEnd: 0, // Will be set on first chunk
};
this.windowCallbacks.push(entry);
return () => {
this.windowCallbacks = this.windowCallbacks.filter((e) => e !== entry);
};
}
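// Usage sketch (illustrative values): with an 8 s window re-triggered every
// 1 s, consecutive windows share 7 s of audio:
//
//   const stop = engine.onWindowChunk(8.0, 7.0, 1.0, (audio, startTime) => {
//     // `audio` holds windowDuration seconds of 16 kHz samples starting at
//     // `startTime` seconds on the ring-buffer timebase
//   });
//   // later: stop();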
/**
* Subscribe to every resampled audio chunk (16kHz).
* Used to feed the continuous mel producer worker.
* Returns an unsubscribe function.
*/
onAudioChunk(callback: (chunk: Float32Array) => void): () => void {
this.audioChunkCallbacks.push(callback);
return () => {
this.audioChunkCallbacks = this.audioChunkCallbacks.filter((cb) => cb !== callback);
};
}
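// Feeding a worker (illustrative; `melWorker` is an assumed consumer, not part
// of this module). Chunks arrive after internal processing has finished, so
// transferring the underlying buffer is safe:
//
//   const stop = engine.onAudioChunk((chunk) => {
//     melWorker.postMessage({ type: 'audio', samples: chunk }, [chunk.buffer]);
//   });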
updateConfig(config: Partial<AudioEngineConfig>): void {
this.config = { ...this.config, ...config };
// Update processor config
if (config.energyThreshold !== undefined) this.audioProcessor.setThreshold(config.energyThreshold);
if (config.minSpeechDuration !== undefined) this.audioProcessor.setMinSpeechDuration(config.minSpeechDuration);
if (config.minSilenceDuration !== undefined) this.audioProcessor.setSilenceLength(config.minSilenceDuration);
if (config.maxSegmentDuration !== undefined) this.audioProcessor.setMaxSegmentDuration(config.maxSegmentDuration);
// Advanced VAD updates
if (config.lookbackDuration !== undefined) this.audioProcessor.setLookbackDuration(config.lookbackDuration);
if (config.overlapDuration !== undefined) this.audioProcessor.setOverlapDuration(config.overlapDuration);
if (config.maxSilenceWithinSpeech !== undefined) this.audioProcessor.setMaxSilenceWithinSpeech(config.maxSilenceWithinSpeech);
if (config.endingSpeechTolerance !== undefined) this.audioProcessor.setEndingSpeechTolerance(config.endingSpeechTolerance);
if (config.snrThreshold !== undefined) this.audioProcessor.setSnrThreshold(config.snrThreshold);
if (config.minSnrThreshold !== undefined) this.audioProcessor.setMinSnrThreshold(config.minSnrThreshold);
}
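// Runtime tuning sketch (illustrative): only the keys present in the partial
// config are forwarded to the processor; everything else keeps its value:
//
//   engine.updateConfig({ energyThreshold: 0.05, minSilenceDuration: 600 });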
async setDevice(deviceId: string): Promise<void> {
this.deviceId = deviceId;
await this.init();
// Reconnect if running
if (this.audioContext && this.workletNode) {
this.sourceNode?.disconnect();
this.sourceNode = this.audioContext.createMediaStreamSource(this.mediaStream!);
this.sourceNode.connect(this.workletNode);
}
}
private disposeAnalyser(): void {
this.analyserSourceNode?.disconnect();
this.analyserNode?.disconnect();
this.analyserGainNode?.disconnect();
this.analyserSourceNode = null;
this.analyserNode = null;
this.analyserGainNode = null;
this.analyserTimeBuffer = null;
this.waveformOut = null;
}
dispose(): void {
this.stop();
this.disposeAnalyser();
this.mediaStream?.getTracks().forEach(track => track.stop());
this.audioContext?.close();
this.audioContext = null;
this.mediaStream = null;
this.workletNode = null;
this.sourceNode = null;
}
private handleAudioChunk(rawChunk: Float32Array, precomputedMaxAbs?: number, chunkSampleRate?: number): void {
// 0. Ensure chunk is at target sample rate (resample only if needed)
const sampleRate = chunkSampleRate ?? this.targetSampleRate;
const needsResample = sampleRate !== this.targetSampleRate;
const chunk = needsResample
? resampleLinear(rawChunk, sampleRate, this.targetSampleRate)
: rawChunk;
// Calculate chunk energy (Peak Amplitude) + SMA for VAD compatibility
let maxAbs = (!needsResample && precomputedMaxAbs !== undefined) ? precomputedMaxAbs : 0;
if (precomputedMaxAbs === undefined || needsResample) {
for (let i = 0; i < chunk.length; i++) {
const abs = Math.abs(chunk[i]);
if (abs > maxAbs) maxAbs = abs;
}
}
// SMA Smoothing (matching legacy UI project logic)
this.energyHistory.push(maxAbs);
if (this.energyHistory.length > 6) {
this.energyHistory.shift();
}
const energy = this.energyHistory.reduce((a: number, b: number) => a + b, 0) / this.energyHistory.length;
this.currentEnergy = energy;
this.energyBarHistory.push(energy);
if (this.energyBarHistory.length > this.BAR_LEVELS_SIZE) {
this.energyBarHistory.shift();
}
// Debug: log when the smoothed energy crosses the threshold and the reported speaking state is about to flip
const isSpeech = energy > this.config.energyThreshold;
const wasSpeaking = this.metrics.isSpeaking;
if (isSpeech !== wasSpeaking) {
console.debug(`[AudioEngine] Energy threshold crossed: ${energy.toFixed(6)} > ${this.config.energyThreshold} = ${isSpeech}`);
}
// 1. Write to ring buffer before any callbacks can transfer the chunk.
this.ringBuffer.write(chunk);
const endFrame = this.ringBuffer.getCurrentFrame();
// 2. Process VAD on resampled audio
// The processor uses its own internal history for lookback, but we pull full audio from ring buffer later.
const currentTime = this.ringBuffer.getCurrentTime();
const segments = this.audioProcessor.processAudioData(chunk, currentTime, energy);
// 2.5 Update visualization buffer
this.updateVisualizationBuffer(chunk);
// 2.6 Update metrics
const stats = this.audioProcessor.getStats();
const stateInfo = this.audioProcessor.getStateInfo();
this.metrics.currentEnergy = energy;
this.metrics.averageEnergy = this.metrics.averageEnergy * 0.95 + energy * 0.05;
this.metrics.peakEnergy = Math.max(this.metrics.peakEnergy * 0.99, energy);
this.metrics.noiseFloor = stats.noiseFloor ?? 0.01;
this.metrics.currentSNR = stats.snr ?? 0;
this.metrics.isSpeaking = stateInfo.inSpeech;
// Sampled debug log (~5% of chunks)
if (Math.random() < 0.05) {
console.debug(`[AudioEngine] Metrics: E=${energy.toFixed(6)}, NF=${this.metrics.noiseFloor.toFixed(6)}, SNR=${this.metrics.currentSNR.toFixed(2)}, Speaking=${this.metrics.isSpeaking}`);
}
// 3. Handle segments
if (segments.length > 0) {
for (const seg of segments) {
// Apply lookback and overlap adjustments matching legacy UI project
const lookbackDuration = this.config.lookbackDuration ?? 0.120;
const startTime = Math.max(0, seg.startTime - lookbackDuration);
// Calculate the sample positions for audio extraction
const startFrame = Math.round(startTime * this.targetSampleRate);
const segEndFrame = Math.round(seg.endTime * this.targetSampleRate); // avoid shadowing the outer endFrame
// Retrieval with padding (hangover)
const speechHangover = this.config.speechHangover ?? 0.16;
const paddedEndFrame = Math.min(
this.ringBuffer.getCurrentFrame(),
segEndFrame + Math.round(speechHangover * this.targetSampleRate)
);
try {
const audioData = this.ringBuffer.read(startFrame, paddedEndFrame);
// Calculate precise energy metrics for filtering
const metrics = this.calculateSegmentEnergyMetrics(audioData, this.targetSampleRate);
// Normalize power to 16kHz equivalent
const normalizedPowerAt16k = metrics.averagePower * 16000;
const normalizedEnergyIntegralAt16k = normalizedPowerAt16k * metrics.duration;
// Adaptive threshold calculation
let minEnergyIntegralThreshold = this.config.minEnergyIntegral ?? 22;
let minEnergyPerSecondThreshold = this.config.minEnergyPerSecond ?? 5;
if (this.config.useAdaptiveEnergyThresholds) {
const windowSize = this.config.windowSize ?? Math.round(0.080 * this.targetSampleRate);
const normalizedNoiseFloor = windowSize > 0 ? this.metrics.noiseFloor / windowSize : 0;
const noiseFloorAt16k = normalizedNoiseFloor * 16000;
const adaptiveMinEnergyIntegral = noiseFloorAt16k * (this.config.adaptiveEnergyIntegralFactor ?? 25.0);
minEnergyIntegralThreshold = Math.max(this.config.minAdaptiveEnergyIntegral ?? 3, adaptiveMinEnergyIntegral);
const adaptiveMinEnergyPerSecond = noiseFloorAt16k * (this.config.adaptiveEnergyPerSecondFactor ?? 10.0);
minEnergyPerSecondThreshold = Math.max(this.config.minAdaptiveEnergyPerSecond ?? 1, adaptiveMinEnergyPerSecond);
}
const isValidSpeech =
metrics.duration >= (this.config.minSpeechDuration / 1000) &&
normalizedPowerAt16k >= minEnergyPerSecondThreshold &&
normalizedEnergyIntegralAt16k >= minEnergyIntegralThreshold;
if (isValidSpeech) {
const audioSegment: AudioSegment = {
startFrame: startFrame,
endFrame: paddedEndFrame,
duration: metrics.duration,
averageEnergy: metrics.averagePower,
timestamp: Date.now(),
};
this.notifySegment(audioSegment);
} else {
console.log('[AudioEngine] Filtered out noise segment:', {
duration: metrics.duration,
power: normalizedPowerAt16k,
integral: normalizedEnergyIntegralAt16k
});
}
} catch (err) {
console.warn('[AudioEngine] Failed to extract audio for validation:', err);
}
}
}
// 4. Fixed-window streaming (v3 token streaming mode)
this.processWindowCallbacks(endFrame);
// 5. Notify audio chunk subscribers AFTER internal processing.
// Callbacks may transfer the chunk's buffer; do not use `chunk` after this.
for (const cb of this.audioChunkCallbacks) {
cb(chunk);
}
// 6. Notify visualization subscribers
this.notifyVisualizationUpdate();
}
/**
* Helper to read audio from ring buffer and calculate energy metrics for a detected segment.
*/
private calculateSegmentEnergyMetrics(audioData: Float32Array, sampleRate: number): { averagePower: number; duration: number; numSamples: number } {
if (!audioData || audioData.length === 0) {
return { averagePower: 0, duration: 0, numSamples: 0 };
}
const numSamples = audioData.length;
let sumOfSquares = 0;
for (let i = 0; i < numSamples; i++) {
sumOfSquares += audioData[i] * audioData[i];
}
const duration = numSamples / sampleRate;
const averagePower = numSamples > 0 ? sumOfSquares / numSamples : 0;
return {
averagePower,
duration,
numSamples
};
}
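// Worked example of the 16 kHz normalization applied in handleAudioChunk()
// (numbers are illustrative): a 1.0 s segment with averagePower (mean of
// squares) 0.002 yields normalizedPowerAt16k = 0.002 * 16000 = 32 and an
// energy integral of 32 * 1.0 = 32, which clears the static defaults
// (minEnergyPerSecond = 5, minEnergyIntegral = 22), so it counts as speech.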
/**
* Process fixed-window callbacks for token streaming mode.
* Fires when enough audio has accumulated for a new window.
*/
private processWindowCallbacks(currentFrame: number): void {
for (const entry of this.windowCallbacks) {
const windowFrames = Math.floor(entry.windowDuration * this.targetSampleRate);
const stepFrames = Math.floor(entry.triggerInterval * this.targetSampleRate);
// Initialize lastWindowEnd on first call
if (entry.lastWindowEnd === 0) {
entry.lastWindowEnd = currentFrame;
continue;
}
// Check if we have enough new audio for the next window
const framesSinceLastWindow = currentFrame - entry.lastWindowEnd;
if (framesSinceLastWindow >= stepFrames) {
// Calculate window boundaries
const windowEnd = currentFrame;
const windowStart = windowEnd - windowFrames;
// Ensure we have enough data in the ring buffer
const baseOffset = this.ringBuffer.getBaseFrameOffset();
if (windowStart >= baseOffset) {
try {
const audio = this.ringBuffer.read(windowStart, windowEnd);
const startTime = windowStart / this.targetSampleRate;
entry.callback(audio, startTime);
entry.lastWindowEnd = windowEnd;
} catch (e) {
console.warn('[AudioEngine] Window read failed:', e);
}
}
}
}
}
private notifySegment(segment: AudioSegment): void {
// Track segment for visualization
this.recentSegments.push({
startTime: segment.startFrame / this.targetSampleRate,
endTime: segment.endFrame / this.targetSampleRate,
isProcessed: false
});
// Limit segments count
if (this.recentSegments.length > this.MAX_SEGMENTS_FOR_VISUALIZATION) {
this.recentSegments.shift();
}
this.segmentCallbacks.forEach((cb) => cb(segment));
}
/**
* Get recent segments for visualization.
*/
getSegmentsForVisualization(): Array<{ startTime: number; endTime: number; isProcessed: boolean }> {
const segments = [...this.recentSegments];
// Add pending segment if speech is currently active
const vadState = this.audioProcessor.getStateInfo();
if (vadState.inSpeech && vadState.speechStartTime !== null) {
segments.push({
startTime: vadState.speechStartTime,
endTime: this.ringBuffer.getCurrentTime(),
isProcessed: false // Pending
});
}
return segments;
}
/**
* Mark a segment as processed (for visualization color coding).
*/
markSegmentProcessed(startTime: number): void {
const segment = this.recentSegments.find(s => Math.abs(s.startTime - startTime) < 0.1);
if (segment) {
segment.isProcessed = true;
}
}
/**
* Update the visualization buffer and summary with new audio data.
*/
private updateVisualizationBuffer(chunk: Float32Array): void {
if (!this.visualizationBuffer || !this.visualizationSummary) return;
const chunkLength = chunk.length;
const bufferLength = this.visualizationBufferSize;
// 1. Update raw circular buffer
if (chunkLength >= bufferLength) {
this.visualizationBuffer.set(chunk.subarray(chunkLength - bufferLength));
this.visualizationBufferPosition = 0;
} else {
const endPosition = this.visualizationBufferPosition + chunkLength;
if (endPosition <= bufferLength) {
this.visualizationBuffer.set(chunk, this.visualizationBufferPosition);
this.visualizationBufferPosition = endPosition % bufferLength;
} else {
const firstPart = bufferLength - this.visualizationBufferPosition;
this.visualizationBuffer.set(chunk.subarray(0, firstPart), this.visualizationBufferPosition);
this.visualizationBuffer.set(chunk.subarray(firstPart), 0);
this.visualizationBufferPosition = (chunkLength - firstPart) % bufferLength;
}
}
// 2. Update summary buffer (Low-res min/max pairs)
// Each point in VIS_SUMMARY_SIZE represents bufferLength / VIS_SUMMARY_SIZE samples
const samplesPerPoint = bufferLength / this.VIS_SUMMARY_SIZE;
const numNewPoints = Math.round(chunkLength / samplesPerPoint);
for (let i = 0; i < numNewPoints; i++) {
const start = Math.floor(i * samplesPerPoint);
const end = Math.min(chunkLength, Math.floor((i + 1) * samplesPerPoint));
if (start >= end) continue;
let min = chunk[start];
let max = chunk[start];
for (let s = start + 1; s < end; s++) {
const v = chunk[s];
if (v < min) min = v;
if (v > max) max = v;
}
// Write to circular summary
const targetIdx = this.visualizationSummaryPosition * 2;
this.visualizationSummary[targetIdx] = min;
this.visualizationSummary[targetIdx + 1] = max;
this.visualizationSummaryPosition = (this.visualizationSummaryPosition + 1) % this.VIS_SUMMARY_SIZE;
}
}
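// With the defaults (30 s at 16 kHz = 480,000 samples, VIS_SUMMARY_SIZE = 2000)
// each summary point covers 240 samples, i.e. one min/max pair per 15 ms,
// matching the resolution noted on VIS_SUMMARY_SIZE above.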
/**
* Get visualization data subsampled to fit the target width.
* Returns min/max pairs for each pixel to preserve peaks in the waveform.
* Zero-allocation except for the returned result.
* @param targetWidth - The desired number of data points (e.g., canvas width).
* @returns Float32Array containing alternating min/max values, length targetWidth * 2.
*/
getVisualizationData(targetWidth: number): Float32Array {
if (!this.visualizationSummary || !targetWidth || targetWidth <= 0) {
return new Float32Array(0);
}
// If targetWidth is at most the summary size, read from the summary (much faster)
if (targetWidth <= this.VIS_SUMMARY_SIZE) {
const subsampledBuffer = new Float32Array(targetWidth * 2);
const samplesPerTarget = this.VIS_SUMMARY_SIZE / targetWidth;
for (let i = 0; i < targetWidth; i++) {
const rangeStart = i * samplesPerTarget;
const rangeEnd = (i + 1) * samplesPerTarget;
let minVal = 0;
let maxVal = 0;
let first = true;
for (let s = Math.floor(rangeStart); s < Math.floor(rangeEnd); s++) {
const idx = ((this.visualizationSummaryPosition + s) % this.VIS_SUMMARY_SIZE) * 2;
const vMin = this.visualizationSummary[idx];
const vMax = this.visualizationSummary[idx + 1];
if (first) {
minVal = vMin;
maxVal = vMax;
first = false;
} else {
if (vMin < minVal) minVal = vMin;
if (vMax > maxVal) maxVal = vMax;
}
}
subsampledBuffer[i * 2] = minVal;
subsampledBuffer[i * 2 + 1] = maxVal;
}
return subsampledBuffer;
}
return this.getVisualizationDataFromRaw(targetWidth);
}
private getVisualizationDataFromRaw(targetWidth: number): Float32Array {
if (!this.visualizationBuffer) return new Float32Array(0);
const buffer = this.visualizationBuffer;
const bufferLength = this.visualizationBufferSize;
const pos = this.visualizationBufferPosition;
const samplesPerPoint = bufferLength / targetWidth;
const subsampledBuffer = new Float32Array(targetWidth * 2);
// Logical index s maps to physical index:
// if s < wrapS: pos + s
// else: s - wrapS (which is s - (bufferLength - pos) = s + pos - bufferLength)
const wrapS = bufferLength - pos;
for (let i = 0; i < targetWidth; i++) {
const startS = Math.floor(i * samplesPerPoint);
const endS = Math.floor((i + 1) * samplesPerPoint);
let minVal = 0;
let maxVal = 0;
let first = true;
// Part 1: Before wrap (Logical indices < wrapS)
// Physical indices: pos + s
const end1 = (endS < wrapS) ? endS : wrapS;
if (startS < end1) {
let p = pos + startS;
const pEnd = pos + end1;
if (first && p < pEnd) {
const val = buffer[p];
minVal = val;
maxVal = val;
first = false;
p++;
}
for (; p < pEnd; p++) {
const val = buffer[p];
if (val < minVal) minVal = val;
else if (val > maxVal) maxVal = val;
}
}
// Part 2: After wrap (Logical indices >= wrapS)
// Physical indices: s - wrapS
const start2 = (startS > wrapS) ? startS : wrapS;
if (start2 < endS) {
let p = start2 - wrapS;
const pEnd = endS - wrapS;
if (first && p < pEnd) {
const val = buffer[p];
minVal = val;
maxVal = val;
first = false;
p++;
}
for (; p < pEnd; p++) {
const val = buffer[p];
if (val < minVal) minVal = val;
else if (val > maxVal) maxVal = val;
}
}
subsampledBuffer[i * 2] = minVal;
subsampledBuffer[i * 2 + 1] = maxVal;
}
return subsampledBuffer;
}
/**
* Get current audio metrics for UI visualization.
*/
getMetrics(): AudioMetrics {
return { ...this.metrics };
}
/**
* Get current time in seconds (for waveform time markers).
*/
getCurrentTime(): number {
return this.ringBuffer.getCurrentTime();
}
/**
* Get the visualization buffer duration in seconds.
*/
getVisualizationDuration(): number {
return VISUALIZATION_BUFFER_DURATION;
}
/**
* Subscribe to visualization updates.
* Callback is invoked after each audio chunk is processed.
*/
onVisualizationUpdate(callback: (data: Float32Array, metrics: AudioMetrics, bufferEndTime: number) => void): () => void {
this.visualizationCallbacks.push(callback);
return () => {
this.visualizationCallbacks = this.visualizationCallbacks.filter((cb) => cb !== callback);
};
}
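// Subscription sketch (illustrative; `ctx` is a 2D canvas context owned by the
// consumer). `data` holds alternating min/max pairs from getVisualizationData():
//
//   const stop = engine.onVisualizationUpdate((data, metrics, endTime) => {
//     ctx.clearRect(0, 0, ctx.canvas.width, ctx.canvas.height);
//     const points = data.length / 2;
//     for (let i = 0; i < points; i++) {
//       const min = data[i * 2], max = data[i * 2 + 1];
//       const x = (i / points) * ctx.canvas.width;
//       const yTop = (0.5 - max * 0.5) * ctx.canvas.height;
//       const yBottom = (0.5 - min * 0.5) * ctx.canvas.height;
//       ctx.fillRect(x, yTop, 1, Math.max(1, yBottom - yTop));
//     }
//   });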
/**
* Notify visualization subscribers with updated data.
* Throttled to ~60fps (VISUALIZATION_NOTIFY_INTERVAL_MS) to avoid UI stuttering.
*/
private notifyVisualizationUpdate(): void {
const now = performance.now();
if (now - this.lastVisualizationNotifyTime < this.VISUALIZATION_NOTIFY_INTERVAL_MS) {
return;
}
this.lastVisualizationNotifyTime = now;
const data = this.getVisualizationData(400); // 400 points is enough for modern displays and saves CPU
const bufferEndTime = this.ringBuffer.getCurrentTime();
this.visualizationCallbacks.forEach((cb) => cb(data, this.getMetrics(), bufferEndTime));
}
}