# Commit 9109931: Fix permission denied — store models/data outside /app mount
"""
Voice Activity Detection - detect speech segments.
"""
import torch
from typing import List, Tuple, Optional
from dataclasses import dataclass
@dataclass
class SpeechSegment:
    """One contiguous span of detected speech, in seconds."""
    start: float  # segment start time (seconds)
    end: float    # segment end time (seconds)

    @property
    def duration(self) -> float:
        """Length of this segment in seconds (end - start)."""
        length = self.end - self.start
        return length
class VoiceActivityDetector:
    """Detect speech segments using the SpeechBrain CRDNN VAD model.

    The model is loaded lazily on first access of :attr:`model`, so
    constructing a detector is cheap and requires no downloads.
    """

    def __init__(self, device: Optional[str] = None):
        """
        Args:
            device: Torch device string (e.g. 'cuda', 'cpu'). Defaults to
                CUDA when available, otherwise CPU.
        """
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self._model = None  # lazily populated by the `model` property

    @property
    def model(self):
        """Lazy-load and cache the SpeechBrain VAD model."""
        if self._model is None:
            from speechbrain.inference.VAD import VAD
            import warnings
            # Suppress the use_auth_token deprecation warning from speechbrain
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", message=".*use_auth_token.*")
                import os
                # MODEL_DIR lets deployments place the model cache outside the
                # application mount (avoids permission-denied on read-only /app).
                model_dir = os.environ.get("MODEL_DIR", "pretrained_models")
                self._model = VAD.from_hparams(
                    source="speechbrain/vad-crdnn-libriparty",
                    savedir=os.path.join(model_dir, "vad"),
                    run_opts={"device": self.device}
                )
        return self._model

    def detect(self, audio_path: str,
               min_speech_duration: float = 0.25,
               min_silence_duration: float = 0.1) -> List[SpeechSegment]:
        """
        Detect speech segments in an audio file.

        Args:
            audio_path: Path to audio file
            min_speech_duration: Minimum speech duration (seconds) to keep
            min_silence_duration: Minimum silence (seconds) to treat as a gap

        Returns:
            List of SpeechSegment objects
        """
        # get_speech_segments runs the full SpeechBrain pipeline
        # (neural VAD + energy VAD + double-check) and returns boundaries.
        boundaries = self.model.get_speech_segments(
            audio_path,
            large_chunk_size=30,
            small_chunk_size=10,
            overlap_small_chunk=True,
            apply_energy_VAD=True,
            double_check=True,
            close_th=min_silence_duration,
            len_th=min_speech_duration
        )
        # boundaries is a tensor of shape [N, 2]; each row is [start, end]
        # in seconds. Re-filter by min_speech_duration as a safety net on
        # top of len_th.
        segments = []
        if boundaries is not None and len(boundaries) > 0:
            for boundary in boundaries:
                start, end = float(boundary[0]), float(boundary[1])
                if end - start >= min_speech_duration:
                    segments.append(SpeechSegment(start=start, end=end))
        return segments

    def detect_from_waveform(self, waveform: torch.Tensor, sample_rate: int,
                             min_speech_duration: float = 0.25,
                             min_silence_duration: float = 0.1) -> List[SpeechSegment]:
        """
        Detect speech segments from an in-memory waveform tensor.

        Args:
            waveform: Audio waveform tensor (channels x samples, as accepted
                by torchaudio.save)
            sample_rate: Sample rate in Hz
            min_speech_duration: Minimum speech duration (seconds) to keep
            min_silence_duration: Minimum silence (seconds) to treat as a gap

        Returns:
            List of SpeechSegment objects
        """
        import tempfile
        import torchaudio
        import os
        # SpeechBrain's VAD API operates on file paths, so round-trip
        # through a temporary WAV file. delete=False so the file persists
        # after the handle closes (needed on Windows for reopening).
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
            temp_path = f.name
        try:
            torchaudio.save(temp_path, waveform, sample_rate)
            # Forward both thresholds so callers get the same control as
            # detect() (previously min_silence_duration was not exposed here).
            return self.detect(temp_path, min_speech_duration, min_silence_duration)
        finally:
            # Always clean up the temp file, even if detection raises.
            if os.path.exists(temp_path):
                os.remove(temp_path)

    def get_total_speech(self, segments: List[SpeechSegment]) -> float:
        """Return the total speech duration (seconds) across all segments."""
        return sum(s.duration for s in segments)

    def get_speech_ratio(self, segments: List[SpeechSegment],
                         total_duration: float) -> float:
        """Return the ratio of speech time to total duration (0.0 when empty)."""
        if total_duration == 0:
            return 0.0
        return self.get_total_speech(segments) / total_duration