// keet-streaming/src/lib/audio/audioParams.ts
// Last change by ysdede (b8cc2bf): feat(space): migrate Hugging Face Space to keet SolidJS app
/**
* Keet - Audio Processing Parameters
* Ported from legacy UI project/config/audioParams.js
*
* Contains all parameters for Voice Activity Detection (VAD) and segment processing.
* Values are sample-rate-aligned to ensure exact integer sample counts.
*/
/**
 * Segmentation preset configuration.
 *
 * Bundles a display label and icon with the three basic VAD values that a
 * preset overrides (`speechHangover`, `audioThreshold`, `silenceLength`).
 * The presets themselves are declared in `segmentationPresets`.
 */
export interface SegmentationPreset {
/** Human-readable preset label, e.g. 'Medium (Balanced)'. */
name: string;
/** Icon identifier, e.g. 'bolt'; presumably a Material icon name — confirm against the UI. */
icon: string;
/** Preset value for `AudioParams.speechHangover`; presumably seconds — confirm against the VAD consumer. */
speechHangover: number;
/** Preset value for `AudioParams.audioThreshold`. */
audioThreshold: number;
/** Preset value for `AudioParams.silenceLength`, in seconds (`getSampleCounts` multiplies it by the sample rate). */
silenceLength: number;
}
/**
 * All audio processing parameters.
 *
 * Describes the complete VAD / segmentation configuration object.
 * `defaultAudioParams` in this module supplies a value for every field.
 * Fields documented as durations are expressed in seconds.
 */
export interface AudioParams {
// Basic VAD settings
/** VAD audio threshold; the module default comes from the 'medium' preset. */
audioThreshold: number;
/** Silence length in seconds; the module default comes from the 'medium' preset. */
silenceLength: number;
/** Speech hangover; presumably seconds — confirm against the VAD consumer. */
speechHangover: number;
// Advanced VAD settings
/** Scaling factor for the energy calculation. */
energyScale: number;
/** Hysteresis ratio used for threshold comparison. */
hysteresisRatio: number;
/** Minimum speech duration, in seconds. */
minSpeechDuration: number;
/** Maximum silence allowed within speech, in seconds. */
maxSilenceWithinSpeech: number;
/** Tolerance for ending speech, in seconds. */
endingSpeechTolerance: number;
/** Threshold multiplier for ending-speech detection. */
endingEnergyThreshold: number;
/** Minimum energy integral for speech detection. */
minEnergyIntegral: number;
/** Minimum energy per second for speech detection. */
minEnergyPerSecond: number;
// Sample-rate-aligned timing parameters
/** Analysis window duration, in seconds. */
windowDuration: number;
/** Lookback duration, in seconds. */
lookbackDuration: number;
/** Overlap duration, in seconds. */
overlapDuration: number;
// Buffer durations
/** Duration of recent audio kept in storage, in seconds. */
recentAudioDuration: number;
/** Duration of the visualization buffer, in seconds. */
visualizationDuration: number;
// SNR and Noise Floor adaptation settings
/** SNR threshold, in dB, for speech detection. */
snrThreshold: number;
/** Minimum SNR threshold for low-energy speech. */
minSnrThreshold: number;
/** Standard adaptation rate for the noise floor (0-1). */
noiseFloorAdaptationRate: number;
/** Fast adaptation rate used for initial calibration. */
fastAdaptationRate: number;
/** Minimum silence duration that counts as "background" for fast adaptation. */
minBackgroundDuration: number;
/** Threshold for detecting a rising energy trend. */
energyRiseThreshold: number;
// Processor-specific parameters
/** Length of the simple moving average used for energy smoothing. */
smaLength: number;
/** Number of chunks to look back for the start of speech. */
lookbackChunks: number;
/** Maximum length of the stored speech/silence statistics history. */
maxHistoryLength: number;
/** Segments longer than this many seconds are split automatically. */
maxSegmentDuration: number;
// Adaptive energy threshold settings
/** Whether adaptive energy thresholds are enabled. */
useAdaptiveEnergyThresholds: boolean;
/** Multiplier applied to the noise floor to obtain the integral threshold. */
adaptiveEnergyIntegralFactor: number;
/** Multiplier applied to the noise floor to obtain the per-second threshold. */
adaptiveEnergyPerSecondFactor: number;
/** Floor for the adaptive energy-integral threshold. */
minAdaptiveEnergyIntegral: number;
/** Floor for the adaptive energy-per-second threshold. */
minAdaptiveEnergyPerSecond: number;
// Default sample rate
/** Sample rate in samples per second; the module default is 16000. */
sampleRate: number;
}
/**
 * Built-in segmentation presets, keyed by pace.
 *
 * Going from `fast` to `slow` the audio threshold drops while the silence
 * length and speech hangover grow. The `medium` entry also supplies this
 * module's default `audioThreshold`, `silenceLength` and `speechHangover`.
 */
export const segmentationPresets: Record<'fast' | 'medium' | 'slow', SegmentationPreset> = {
  // Highest threshold, shortest silence: 0.1 s is 1.25 x the 80 ms window.
  fast: {
    name: 'Fast (Short Segments)',
    icon: 'bolt',
    speechHangover: 0.08,
    audioThreshold: 0.12,
    silenceLength: 0.1,
  },

  // Middle threshold; 0.4 s of silence is exactly 5 x the 80 ms window.
  medium: {
    name: 'Medium (Balanced)',
    icon: 'av_timer',
    speechHangover: 0.16,
    audioThreshold: 0.08,
    silenceLength: 0.4,
  },

  // Lowest threshold (the original default); 1 s of silence is 12.5 x the 80 ms window.
  slow: {
    name: 'Slow (Long Segments)',
    icon: 'hourglass_bottom',
    speechHangover: 0.24,
    audioThreshold: 0.06,
    silenceLength: 1,
  },
};
// ============================================================================
// Module-level defaults (basic VAD values come from the 'medium' preset)
// ============================================================================

/** Basic VAD settings, read straight from the 'medium' segmentation preset. */
export const { audioThreshold, silenceLength, speechHangover } = segmentationPresets.medium;

// --- Advanced VAD settings --------------------------------------------------

/** Scaling factor for the energy calculation. */
export const energyScale = 2.0;
/** Hysteresis ratio for threshold comparison. */
export const hysteresisRatio = 1.2;
/** Minimum speech duration: 240 ms (3 x 80 ms). */
export const minSpeechDuration = 0.24;
/** Maximum silence within speech: 160 ms (2 x 80 ms). */
export const maxSilenceWithinSpeech = 0.16;
/** Tolerance for ending speech: 240 ms. */
export const endingSpeechTolerance = 0.24;
/** Threshold multiplier for ending-speech detection. */
export const endingEnergyThreshold = 0.6;
/** Minimum energy integral for speech detection. */
export const minEnergyIntegral = 22;
/** Minimum energy per second for speech detection. */
export const minEnergyPerSecond = 5;

// --- Adaptive energy threshold settings -------------------------------------

/** Adaptive energy thresholds are enabled by default. */
export const useAdaptiveEnergyThresholds = true;
/** Multiplier applied to the noise floor to get the integral threshold. */
export const adaptiveEnergyIntegralFactor = 25.0;
/** Multiplier applied to the noise floor to get the per-second threshold. */
export const adaptiveEnergyPerSecondFactor = 10.0;
/** Floor for the adaptive integral threshold. */
export const minAdaptiveEnergyIntegral = 3;
/** Floor for the adaptive per-second threshold. */
export const minAdaptiveEnergyPerSecond = 1;

// --- Sample-rate-aligned timing (seconds) -----------------------------------
// Each value is chosen to divide evenly into common sample rates.

/** 80 ms analysis window. */
export const windowDuration = 0.08;
/** 120 ms lookback. */
export const lookbackDuration = 0.12;
/** 80 ms overlap. */
export const overlapDuration = 0.08;

// --- Buffer durations (seconds) ---------------------------------------------

/** 3 seconds of recent audio storage. */
export const recentAudioDuration = 3.0;
/** 30 seconds of visualization buffer. */
export const visualizationDuration = 30.0;

// --- SNR and noise floor adaptation -----------------------------------------

/** SNR threshold in dB for speech detection. */
export const snrThreshold = 3.0;
/** Minimum SNR threshold for low-energy speech. */
export const minSnrThreshold = 1.0;
/** Standard adaptation rate for the noise floor (0-1). */
export const noiseFloorAdaptationRate = 0.05;
/** Fast adaptation rate for initial calibration. */
export const fastAdaptationRate = 0.15;
/** Minimum silence duration to count as "background" for fast adaptation. */
export const minBackgroundDuration = 1.0;
/** Threshold for detecting a rising energy trend. */
export const energyRiseThreshold = 0.08;

// --- Processor-specific parameters ------------------------------------------

/** Length of the simple moving average used for energy smoothing. */
export const smaLength = 6;
/** Number of chunks to look back for the start of speech. */
export const lookbackChunks = 3;
/** Maximum length of the stored speech/silence statistics history. */
export const maxHistoryLength = 20;
/** Segments longer than this many seconds are split automatically. */
export const maxSegmentDuration = 4.8;
/** Default sample rate for Parakeet models. */
export const sampleRate = 16000;
/**
 * Convert this module's default duration settings into sample counts.
 *
 * Every duration (in seconds) is multiplied by `sampleRate` and rounded to the
 * nearest integer. The durations are always the module-level defaults; a
 * customised `AudioParams` object is not consulted.
 *
 * @param sampleRate - Sample rate, in samples per second, to convert with.
 * @returns Sample counts for the window, lookback, overlap, recent-audio and
 *   visualization durations, the minimum speech duration and the silence length.
 */
export function getSampleCounts(sampleRate: number): {
  windowSamples: number;
  lookbackSamples: number;
  overlapSamples: number;
  recentAudioSamples: number;
  visualizationSamples: number;
  minSpeechSamples: number;
  silenceSamples: number;
} {
  // Single conversion point: seconds -> whole samples at the requested rate.
  const toSamples = (seconds: number): number => Math.round(seconds * sampleRate);

  return {
    windowSamples: toSamples(windowDuration),
    lookbackSamples: toSamples(lookbackDuration),
    overlapSamples: toSamples(overlapDuration),
    recentAudioSamples: toSamples(recentAudioDuration),
    visualizationSamples: toSamples(visualizationDuration),
    minSpeechSamples: toSamples(minSpeechDuration),
    silenceSamples: toSamples(silenceLength),
  };
}
/**
 * Complete `AudioParams` object assembled from this module's default
 * constants. Also exposed as the module's default export.
 */
export const defaultAudioParams: AudioParams = {
  // Basic VAD settings
  audioThreshold,
  silenceLength,
  speechHangover,

  // Advanced VAD settings
  energyScale,
  hysteresisRatio,
  minSpeechDuration,
  maxSilenceWithinSpeech,
  endingSpeechTolerance,
  endingEnergyThreshold,
  minEnergyIntegral,
  minEnergyPerSecond,

  // Timing
  windowDuration,
  lookbackDuration,
  overlapDuration,

  // Buffers
  recentAudioDuration,
  visualizationDuration,

  // SNR and noise floor adaptation
  snrThreshold,
  minSnrThreshold,
  noiseFloorAdaptationRate,
  fastAdaptationRate,
  minBackgroundDuration,
  energyRiseThreshold,

  // Processor-specific
  smaLength,
  lookbackChunks,
  maxHistoryLength,
  maxSegmentDuration,

  // Adaptive energy thresholds
  useAdaptiveEnergyThresholds,
  adaptiveEnergyIntegralFactor,
  adaptiveEnergyPerSecondFactor,
  minAdaptiveEnergyIntegral,
  minAdaptiveEnergyPerSecond,

  // Sample rate
  sampleRate,
};

export default defaultAudioParams;