keet-streaming / src /lib /audio /AudioSegmentProcessor.ts
ysdede's picture
feat(space): migrate Hugging Face Space to keet SolidJS app
b8cc2bf
/**
* Keet - Audio Segment Processor
* Ported from legacy UI project/AudioSegmentProcessor.js
*
* Sophisticated VAD-based segment processor with:
* - Speech onset detection with lookback
* - Rising energy trend analysis
* - Adaptive noise floor tracking
* - SNR-based speech detection
* - Proactive segment splitting for long utterances
*/
import { defaultAudioParams, windowDuration as DEFAULT_WINDOW_DURATION } from './audioParams';
/**
 * One entry of the rolling per-chunk history that the processor keeps for
 * speech-onset lookback.
 */
interface ChunkInfo {
  /** Timestamp the caller supplied for this chunk. */
  time: number;
  /** True when the chunk's energy exceeded the configured energy threshold. */
  isSpeech: boolean;
  /** Energy value the caller supplied for this chunk. */
  energy: number;
  /** Signal-to-noise ratio in dB, relative to the noise floor tracked at the time. */
  snr: number;
}
/**
 * Summary of one finished speech or silence span, as stored in the
 * statistics history.
 */
interface SegmentStats {
  /** Time at which the span began. */
  startTime: number;
  /** Time at which the span was closed. */
  endTime: number;
  /** Length of the span (`endTime - startTime`). */
  duration: number;
  /** Energy integral of the span, recorded as `avgEnergy * duration`. */
  energyIntegral: number;
  /** Mean of the per-chunk energies collected during the span (0 if none). */
  avgEnergy: number;
}
/**
 * Averages taken over a history of {@link SegmentStats} entries.
 * All fields are 0 when the history is empty.
 */
interface StatsSummary {
  /** Mean `energyIntegral` of the recorded spans. */
  avgEnergyIntegral: number;
  /** Mean `avgEnergy` of the recorded spans. */
  avgEnergy: number;
  /** Mean `duration` of the recorded spans. */
  avgDuration: number;
}
/**
 * Snapshot of the processor's statistics, rebuilt after every processed chunk
 * and returned by `getStats()`.
 */
interface CurrentStats {
  /** Averages over the recorded speech spans. */
  speech: StatsSummary;
  /** Averages over the recorded silence spans. */
  silence: StatsSummary;
  /** Noise floor estimate at the time of the snapshot. */
  noiseFloor: number;
  /** SNR (dB) of the most recent chunk, or 0 when no chunk has been seen. */
  snr: number;
  /** Copy of the configured `snrThreshold`. */
  snrThreshold: number;
  /** Copy of the configured `minSnrThreshold`. */
  minSnrThreshold: number;
  /** Copy of the configured `energyRiseThreshold`. */
  energyRiseThreshold: number;
}
/**
 * Mutable working state of the processor; rebuilt from scratch by `reset()`.
 */
interface ProcessorState {
  // --- Speech / silence state machine ---
  /** True while a speech segment is open. */
  inSpeech: boolean;
  /** Start time of the open speech segment, or null while in silence. */
  speechStartTime: number | null;
  /** Time at which the current silence began, or null while in speech. */
  silenceStartTime: number | null;
  /** Number of silent chunks counted against the open speech segment. */
  silenceCounter: number;
  /**
   * Accumulated duration of the current run of non-speech chunks, computed as
   * `chunk.length / sampleRate` (seconds if sampleRate is in Hz). Drives the
   * blend between fast and normal noise-floor adaptation.
   */
  silenceDuration: number;

  // --- Noise tracking ---
  /** Adaptive noise floor estimate (starts at 0.005). */
  noiseFloor: number;
  /** The most recent chunk energies (at most 50 are kept). */
  recentEnergies: number[];

  // --- Histories ---
  /** Rolling per-chunk history used for onset lookback. */
  recentChunks: ChunkInfo[];
  /** Energies collected for the open speech segment. */
  speechEnergies: number[];
  /** Energies collected while in (or deciding on) silence. */
  silenceEnergies: number[];
  /** History of finished speech spans. */
  speechStats: SegmentStats[];
  /** History of finished silence spans (never appended to in the visible code). */
  silenceStats: SegmentStats[];

  // --- Derived / bookkeeping ---
  /** Latest statistics snapshot. */
  currentStats: CurrentStats;
  /** Initialised to 0 on reset; not otherwise used in the visible code. */
  segmentCounter: number;
}
/**
 * A detected speech segment, as returned by `processAudioData()`.
 * Times are in the same units as the `currentTime` values fed to the processor.
 */
export interface ProcessedSegment {
  /** Time at which the segment starts (may predate detection due to lookback). */
  startTime: number;
  /** Length of the segment (`endTime - startTime`, always greater than 0). */
  duration: number;
  /** Time of the chunk that closed or split the segment. */
  endTime: number;
}
/**
 * Configuration of {@link AudioSegmentProcessor}.
 *
 * Durations are presumably in seconds and `sampleRate` in Hz (the processor
 * derives chunk durations as `samples / sampleRate`) — confirm against the
 * shared audio parameter defaults.
 */
export interface AudioSegmentProcessorConfig {
  // --- Audio format ---
  /** Sample rate used to turn sample counts into durations. */
  sampleRate: number;
  /** Samples per analysis window; always derived by the constructor from the sample rate. */
  windowSize: number;

  // --- Speech / silence decision ---
  /** A chunk counts as speech when its energy is strictly above this value. */
  energyThreshold: number;
  /** Amount of silence required before an open speech segment is closed. */
  silenceThreshold: number;
  /** Silence shorter than this inside a segment is still treated as speech. */
  maxSilenceWithinSpeech: number;
  /** Open segments longer than this are split proactively. */
  maxSegmentDuration: number;
  /** Stored and settable; not read by the visible detection logic. */
  minSpeechDuration: number;
  /** Stored and settable; not read by the visible detection logic. */
  endingSpeechTolerance: number;

  // --- Onset lookback ---
  /** SNR (dB) used when walking back from the detected onset. */
  minSnrThreshold: number;
  /** Relative energy increase between neighbouring chunks that counts as a rise. */
  energyRiseThreshold: number;
  /** Stored only; not read by the visible detection logic. */
  lookbackChunks: number;
  /** Stored, logged and settable; not read by the visible detection logic. */
  lookbackDuration: number;
  /** Stored, logged and settable; not read by the visible detection logic. */
  overlapDuration: number;
  /** Stored only; not read by the visible detection logic. */
  smaLength: number;

  // --- Noise floor tracking ---
  /** Steady-state EMA rate for the noise floor. */
  noiseFloorAdaptationRate: number;
  /** EMA rate used right after silence begins, blended toward the normal rate. */
  fastAdaptationRate: number;
  /** Silence duration over which the fast rate blends into the normal rate. */
  minBackgroundDuration: number;
  /** Reported in the stats snapshot; not used for detection in the visible code. */
  snrThreshold: number;

  // --- Bookkeeping ---
  /** Maximum number of speech spans kept; the chunk history keeps ten times as many entries. */
  maxHistoryLength: number;
  /** Optional log sink; messages are prefixed with `[AudioSegmentProcessor]`. */
  logger?: (message: string, data?: unknown) => void;
}
/**
 * AudioSegmentProcessor - streaming voice-activity detector that turns a
 * sequence of (chunk, time, energy) observations into speech segments.
 *
 * Per chunk it:
 * - classifies the chunk as speech when `energy > energyThreshold`,
 * - tracks an adaptive noise floor and the chunk's SNR,
 * - on a silence -> speech transition, walks back through recent chunks to
 *   find where the energy started rising (speech onset lookback),
 * - closes the open segment after enough *consecutive* silence,
 * - proactively splits segments longer than `maxSegmentDuration`.
 *
 * Several options (`minSpeechDuration`, `endingSpeechTolerance`,
 * `snrThreshold`, `lookbackChunks`, `lookbackDuration`, `overlapDuration`,
 * `smaLength`) are stored and settable but are not consulted by the detection
 * logic in this class.
 */
export class AudioSegmentProcessor {
  /**
   * The rising-energy scan stops once it is more than this many chunks behind
   * the onset chunk (~500ms assuming 80ms chunks).
   */
  private static readonly MAX_ONSET_LOOKBACK_CHUNKS = 6;

  private options: AudioSegmentProcessorConfig;
  private state!: ProcessorState;

  /**
   * @param options Overrides for the shared audio parameter defaults.
   *   `windowSize` is always derived from the effective sample rate and any
   *   supplied value is ignored.
   */
  constructor(options: Partial<AudioSegmentProcessorConfig> = {}) {
    const sampleRate = options.sampleRate ?? defaultAudioParams.sampleRate ?? 16000;

    this.options = {
      minSpeechDuration: defaultAudioParams.minSpeechDuration,
      silenceThreshold: defaultAudioParams.silenceLength,
      energyThreshold: defaultAudioParams.audioThreshold,
      smaLength: defaultAudioParams.smaLength,
      lookbackChunks: defaultAudioParams.lookbackChunks,
      overlapDuration: defaultAudioParams.overlapDuration,
      lookbackDuration: defaultAudioParams.lookbackDuration,
      maxHistoryLength: defaultAudioParams.maxHistoryLength,
      noiseFloorAdaptationRate: defaultAudioParams.noiseFloorAdaptationRate,
      fastAdaptationRate: defaultAudioParams.fastAdaptationRate,
      snrThreshold: defaultAudioParams.snrThreshold,
      minBackgroundDuration: defaultAudioParams.minBackgroundDuration,
      minSnrThreshold: defaultAudioParams.minSnrThreshold,
      energyRiseThreshold: defaultAudioParams.energyRiseThreshold,
      maxSegmentDuration: defaultAudioParams.maxSegmentDuration,
      maxSilenceWithinSpeech: defaultAudioParams.maxSilenceWithinSpeech,
      endingSpeechTolerance: defaultAudioParams.endingSpeechTolerance,
      logger: console.log,
      ...options,
      // Written after the spread so an explicit `sampleRate: undefined` in
      // `options` cannot clobber the resolved rate, and so that the window
      // size always matches the rate actually in use.
      sampleRate,
      windowSize: Math.round(DEFAULT_WINDOW_DURATION * sampleRate)
    };

    this.log('Initialized AudioSegmentProcessor', {
      sampleRate: this.options.sampleRate,
      windowSize: this.options.windowSize,
      lookbackDuration: this.options.lookbackDuration,
      overlapDuration: this.options.overlapDuration,
      snrThreshold: this.options.snrThreshold,
      minSnrThreshold: this.options.minSnrThreshold
    });

    this.reset();
  }

  /** Forward a prefixed message to the configured logger, if there is one. */
  private log(message: string, data?: unknown): void {
    if (typeof this.options.logger === 'function') {
      this.options.logger(`[AudioSegmentProcessor] ${message}`, data);
    }
  }

  /**
   * Process an audio chunk and return any segments it completed.
   *
   * @param chunk Samples of the chunk; only its length is used (to measure
   *   silence duration). An empty or missing chunk is ignored.
   * @param currentTime Timestamp of the chunk; segment boundaries are
   *   expressed in these units.
   * @param energy Precomputed energy of the chunk.
   * @returns Zero or more finished segments (a split and an ordinary end can
   *   both occur for the same chunk).
   */
  processAudioData(
    chunk: Float32Array,
    currentTime: number,
    energy: number
  ): ProcessedSegment[] {
    if (!chunk || !chunk.length) return [];

    const segments: ProcessedSegment[] = [];
    const isSpeech = energy > this.options.energyThreshold;

    // Track how long the current run of non-speech chunks has lasted.
    if (isSpeech) {
      this.state.silenceDuration = 0;
    } else {
      this.state.silenceDuration += chunk.length / this.options.sampleRate;
    }

    // Update noise floor and calculate SNR.
    this.updateNoiseFloor(energy, isSpeech);
    const snr = this.calculateSNR(energy);

    // Track recent chunks for lookback.
    this.pushBounded(this.state.recentChunks, { time: currentTime, energy, isSpeech, snr });

    // --- Proactive Segment Splitting ---
    if (this.state.inSpeech && this.state.speechStartTime !== null) {
      const startedAt = this.state.speechStartTime;
      const currentSpeechDuration = currentTime - startedAt;
      if (currentSpeechDuration > this.options.maxSegmentDuration) {
        this.log('Splitting long segment', {
          startTime: startedAt.toFixed(2),
          splitTime: currentTime.toFixed(2),
          duration: currentSpeechDuration.toFixed(2)
        });
        const segment = this.createSegment(startedAt, currentTime);
        if (segment) {
          segments.push(segment);
        }
        // Start new segment immediately.
        this.startSpeech(currentTime, energy);
      }
    }

    // --- Speech State Machine ---
    if (!this.state.inSpeech && isSpeech) {
      // Transition: Silence -> Speech
      const realStartIndex = this.findSpeechStart();
      const realStartTime = realStartIndex !== -1
        ? this.state.recentChunks[realStartIndex].time
        : currentTime;
      this.startSpeech(realStartTime, energy);
      this.log('Speech start detected', {
        detectedAt: currentTime.toFixed(2),
        actualStart: realStartTime.toFixed(2),
        lookbackDiff: (currentTime - realStartTime).toFixed(2),
        snr: snr.toFixed(2),
        noiseFloor: this.state.noiseFloor.toFixed(6)
      });
    } else if (this.state.inSpeech && !isSpeech) {
      // Transition: Speech -> potentially Silence
      const ended = this.handleSilenceDuringSpeech(currentTime, energy, snr);
      if (ended) {
        segments.push(ended);
      }
    } else if (this.state.inSpeech) {
      // Speech continues. Any pending run of silence is over, so the counter
      // must restart; otherwise isolated dips would add up over the whole
      // segment and eventually end it on a single silent chunk.
      this.state.silenceCounter = 0;
      this.state.speechEnergies.push(energy);
    } else {
      // Silence continues. Bounded so a long quiet stream cannot grow the
      // buffer indefinitely.
      this.pushBounded(this.state.silenceEnergies, energy);
    }

    this.updateStats();
    return segments;
  }

  /**
   * Handle a silent chunk that arrives while a speech segment is open.
   *
   * @returns The finished segment when the silence is long enough to close
   *   the speech segment, otherwise null.
   */
  private handleSilenceDuringSpeech(
    currentTime: number,
    energy: number,
    snr: number
  ): ProcessedSegment | null {
    this.state.silenceCounter++;

    const chunkDuration = this.options.windowSize / this.options.sampleRate;
    const chunksNeeded = Math.ceil(this.options.silenceThreshold / chunkDuration);

    if (this.state.silenceCounter % 5 === 0) {
      this.log('Silence progressing', {
        counter: this.state.silenceCounter,
        needed: chunksNeeded,
        energy: energy.toFixed(6),
        snr: snr.toFixed(2)
      });
    }

    const silenceDuration = this.state.silenceCounter * chunkDuration;
    const isConfirmedSilence = this.state.silenceCounter >= chunksNeeded;

    if (silenceDuration < this.options.maxSilenceWithinSpeech) {
      // Short pause: still counted as part of the speech.
      this.state.speechEnergies.push(energy);
      return null;
    }

    if (!isConfirmedSilence) {
      // Too long to ignore, not yet long enough to end the segment.
      this.pushBounded(this.state.silenceEnergies, energy);
      return null;
    }

    // Confirmed silence - end the speech segment.
    const startedAt = this.state.speechStartTime;
    let segment: ProcessedSegment | null = null;
    if (startedAt !== null) {
      const speechDuration = currentTime - startedAt;
      const avgEnergy = this.average(this.state.speechEnergies);
      this.state.speechStats.push({
        startTime: startedAt,
        endTime: currentTime,
        duration: speechDuration,
        avgEnergy,
        energyIntegral: avgEnergy * speechDuration
      });
      if (this.state.speechStats.length > this.options.maxHistoryLength) {
        this.state.speechStats.shift();
      }
      segment = this.createSegment(startedAt, currentTime);
    }
    this.startSilence(currentTime);
    return segment;
  }

  /**
   * Append to a history buffer, dropping the oldest entry once the buffer
   * exceeds ten times `maxHistoryLength`.
   */
  private pushBounded<T>(buffer: T[], value: T): void {
    buffer.push(value);
    if (buffer.length > this.options.maxHistoryLength * 10) {
      buffer.shift();
    }
  }

  /**
   * Update noise floor using adaptive exponential moving average.
   * Only non-speech chunks move the floor; every chunk is recorded in the
   * recent-energy window (last 50 values).
   */
  private updateNoiseFloor(energy: number, isSpeech: boolean): void {
    if (!isSpeech) {
      // Blend between fast and normal adaptation rates based on silence duration.
      let adaptationRate = this.options.noiseFloorAdaptationRate;
      if (this.state.silenceDuration < this.options.minBackgroundDuration) {
        const blendFactor = Math.min(1, this.state.silenceDuration / this.options.minBackgroundDuration);
        adaptationRate = this.options.fastAdaptationRate * (1 - blendFactor) +
          this.options.noiseFloorAdaptationRate * blendFactor;
      }

      // Exponential moving average, clamped away from zero.
      this.state.noiseFloor = this.state.noiseFloor * (1 - adaptationRate) + energy * adaptationRate;
      this.state.noiseFloor = Math.max(0.00001, this.state.noiseFloor);
    }

    // Track recent energies for analysis.
    this.state.recentEnergies.push(energy);
    if (this.state.recentEnergies.length > 50) {
      this.state.recentEnergies.shift();
    }
  }

  /**
   * Calculate Signal-to-Noise Ratio in dB against the tracked noise floor
   * (the floor is treated as at least 0.0001 for this computation).
   */
  private calculateSNR(energy: number): number {
    const noiseFloor = Math.max(0.0001, this.state.noiseFloor);
    return 10 * Math.log10(energy / noiseFloor);
  }

  /**
   * Start tracking a new speech segment beginning at `time`.
   */
  private startSpeech(time: number, energy: number): void {
    this.state.inSpeech = true;
    this.state.speechStartTime = time;
    this.state.silenceCounter = 0;
    this.state.speechEnergies = [energy];
    this.state.silenceStartTime = null;
    this.state.silenceDuration = 0;

    const snr = this.calculateSNR(energy);
    this.log('Speech state started', {
      time: time.toFixed(2),
      energy: energy.toFixed(6),
      snr: snr.toFixed(2),
      noiseFloor: this.state.noiseFloor.toFixed(6)
    });
  }

  /**
   * Transition to silence state.
   */
  private startSilence(time: number): void {
    this.state.inSpeech = false;
    this.state.silenceStartTime = time;
    this.state.speechStartTime = null;
    this.state.silenceCounter = 0;
    this.state.silenceEnergies = [];
    this.state.silenceDuration = 0.001; // Avoid division by zero

    this.log('Silence state started', {
      time: time.toFixed(2),
      noiseFloor: this.state.noiseFloor.toFixed(6)
    });
  }

  /**
   * Find the actual speech start using lookback and energy trend analysis.
   *
   * @returns Index into the recent-chunk history of the estimated onset, or
   *   -1 when the history is empty.
   */
  private findSpeechStart(): number {
    const chunks = this.state.recentChunks;
    if (chunks.length === 0) return -1;

    const minSnrThreshold = this.options.minSnrThreshold;

    // Find the most recent speech chunk.
    let firstSpeechIndex = 0;
    for (let i = chunks.length - 1; i >= 0; i--) {
      if (chunks[i].isSpeech) {
        firstSpeechIndex = i;
        break;
      }
    }

    // Look for the earliest point where energy starts rising towards speech.
    const riseFactor = 1 + this.options.energyRiseThreshold;
    let earliestRisingIndex = firstSpeechIndex;
    let foundRisingTrend = false;
    for (let i = firstSpeechIndex - 1; i >= 0; i--) {
      // Rising energy from chunk i to its successor.
      if (chunks[i + 1].energy > chunks[i].energy * riseFactor) {
        earliestRisingIndex = i;
        foundRisingTrend = true;
      }
      // Stop if SNR drops significantly below threshold.
      if (chunks[i].snr < minSnrThreshold / 2) {
        break;
      }
      // Limit how far back the scan may go.
      if (firstSpeechIndex - i > AudioSegmentProcessor.MAX_ONSET_LOOKBACK_CHUNKS) {
        break;
      }
    }

    if (foundRisingTrend) {
      this.log('Found rising energy trend for speech onset', {
        index: earliestRisingIndex,
        time: chunks[earliestRisingIndex].time.toFixed(3),
        energy: chunks[earliestRisingIndex].energy.toFixed(6),
        snr: chunks[earliestRisingIndex].snr.toFixed(2)
      });
      return earliestRisingIndex;
    }

    // Check for SNR crossing: the chunk right after the last low-SNR chunk.
    for (let i = firstSpeechIndex; i >= 0; i--) {
      if (chunks[i].snr < minSnrThreshold) {
        return Math.min(chunks.length - 1, i + 1);
      }
    }

    // Default lookback.
    return Math.max(0, firstSpeechIndex - 4);
  }

  /**
   * Create a segment object from start/end times.
   *
   * @returns The segment, or null when it would have zero or negative duration.
   */
  private createSegment(startTime: number, endTime: number): ProcessedSegment | null {
    const duration = endTime - startTime;
    if (duration <= 0) {
      this.log('Skipping segment with zero/negative duration');
      return null;
    }
    return {
      startTime,
      endTime,
      duration
    };
  }

  /**
   * Rebuild the statistics snapshot from the current state.
   */
  private updateStats(): void {
    const recent = this.state.recentChunks;
    this.state.currentStats = {
      silence: this.summarize(this.state.silenceStats),
      speech: this.summarize(this.state.speechStats),
      noiseFloor: this.state.noiseFloor,
      snr: recent.length > 0 ? recent[recent.length - 1].snr : 0,
      snrThreshold: this.options.snrThreshold,
      minSnrThreshold: this.options.minSnrThreshold,
      energyRiseThreshold: this.options.energyRiseThreshold
    };
  }

  /** Average a span history field by field; all zeros for an empty history. */
  private summarize(history: SegmentStats[]): StatsSummary {
    return {
      avgDuration: this.average(history.map(s => s.duration)),
      avgEnergy: this.average(history.map(s => s.avgEnergy)),
      avgEnergyIntegral: this.average(history.map(s => s.energyIntegral))
    };
  }

  /** Arithmetic mean of `arr`, or 0 when it is empty. */
  private average(arr: number[]): number {
    if (arr.length === 0) return 0;
    return arr.reduce((a, b) => a + b, 0) / arr.length;
  }

  /**
   * Get current statistics.
   */
  getStats(): CurrentStats {
    return this.state.currentStats;
  }

  /**
   * Get current state info for debugging.
   */
  getStateInfo(): { inSpeech: boolean; noiseFloor: number; snr: number; speechStartTime: number | null } {
    return {
      inSpeech: this.state.inSpeech,
      noiseFloor: this.state.noiseFloor,
      snr: this.state.currentStats.snr,
      speechStartTime: this.state.speechStartTime
    };
  }

  /**
   * Reset all state. Configuration is kept.
   */
  reset(): void {
    this.state = {
      inSpeech: false,
      speechStartTime: null,
      silenceStartTime: null,
      silenceCounter: 0,
      recentChunks: [],
      speechEnergies: [],
      silenceEnergies: [],
      speechStats: [],
      silenceStats: [],
      currentStats: {
        silence: { avgDuration: 0, avgEnergy: 0, avgEnergyIntegral: 0 },
        speech: { avgDuration: 0, avgEnergy: 0, avgEnergyIntegral: 0 },
        noiseFloor: 0.005,
        snr: 0,
        snrThreshold: this.options.snrThreshold,
        minSnrThreshold: this.options.minSnrThreshold,
        energyRiseThreshold: this.options.energyRiseThreshold
      },
      segmentCounter: 0,
      noiseFloor: 0.005,
      recentEnergies: [],
      silenceDuration: 0
    };
  }

  // ========================================================================
  // Configuration Setters
  // ========================================================================

  /** Set the energy above which a chunk counts as speech. */
  setThreshold(threshold: number): void {
    this.options.energyThreshold = threshold;
    this.log('Updated energy threshold', threshold);
  }

  /** Set the amount of silence required to close a speech segment. */
  setSilenceLength(length: number): void {
    this.options.silenceThreshold = length;
    this.log('Updated silence threshold', length);
  }

  /** Set the stored lookback duration option. */
  setLookbackDuration(duration: number): void {
    this.options.lookbackDuration = duration;
    this.log('Updated lookback duration', duration);
  }

  /** Set the stored overlap duration option. */
  setOverlapDuration(duration: number): void {
    this.options.overlapDuration = duration;
    this.log('Updated overlap duration', duration);
  }

  /** Set the SNR threshold reported in the statistics snapshot. */
  setSnrThreshold(threshold: number): void {
    this.options.snrThreshold = threshold;
    this.log('Updated SNR threshold', threshold);
  }

  /** Set the SNR threshold used by the onset lookback. */
  setMinSnrThreshold(threshold: number): void {
    this.options.minSnrThreshold = threshold;
    this.log('Updated minimum SNR threshold', threshold);
  }

  /** Set the steady-state noise floor adaptation rate. */
  setNoiseFloorAdaptationRate(rate: number): void {
    this.options.noiseFloorAdaptationRate = rate;
    this.log('Updated noise floor adaptation rate', rate);
  }

  /** Set the noise floor adaptation rate used right after silence begins. */
  setFastAdaptationRate(rate: number): void {
    this.options.fastAdaptationRate = rate;
    this.log('Updated fast adaptation rate', rate);
  }

  /** Set the relative energy increase that counts as a rising trend. */
  setEnergyRiseThreshold(threshold: number): void {
    this.options.energyRiseThreshold = threshold;
    this.log('Updated energy rise threshold', threshold);
  }

  /** Set the silence duration over which adaptation blends to the normal rate. */
  setMinBackgroundDuration(duration: number): void {
    this.options.minBackgroundDuration = duration;
    this.log('Updated minimum background duration', duration);
  }

  /** Set the segment duration beyond which segments are split. */
  setMaxSegmentDuration(duration: number): void {
    this.options.maxSegmentDuration = duration;
    this.log('Updated maximum segment duration', duration);
  }

  /** Set the stored minimum speech duration option. */
  setMinSpeechDuration(duration: number): void {
    this.options.minSpeechDuration = duration;
    this.log('Updated minimum speech duration', duration);
  }

  /** Set how much silence inside a segment is still treated as speech. */
  setMaxSilenceWithinSpeech(duration: number): void {
    this.options.maxSilenceWithinSpeech = duration;
    this.log('Updated max silence within speech', duration);
  }

  /** Set the stored ending speech tolerance option. */
  setEndingSpeechTolerance(duration: number): void {
    this.options.endingSpeechTolerance = duration;
    this.log('Updated ending speech tolerance', duration);
  }
}