# NOTE(review): the lines below were page-header residue from a web extraction
# ("Spaces: Running Running"); commented out so the module parses.
| """ | |
| Voice Activity Detection - detect speech segments. | |
| """ | |
| import torch | |
| from typing import List, Tuple, Optional | |
| from dataclasses import dataclass | |
@dataclass
class SpeechSegment:
    """A contiguous segment of detected speech, with times in seconds.

    BUGFIX: the class declared dataclass-style field annotations and is
    constructed with keyword arguments (``SpeechSegment(start=..., end=...)``)
    but was missing the ``@dataclass`` decorator, so construction raised
    ``TypeError``. ``duration`` is also consumed as an attribute
    (``sum(s.duration for s in segments)``), so it is exposed as a property.
    """
    start: float  # segment start time (seconds)
    end: float    # segment end time (seconds)

    @property
    def duration(self) -> float:
        """Length of the segment in seconds (end - start)."""
        return self.end - self.start
class VoiceActivityDetector:
    """Detect speech segments using the SpeechBrain CRDNN VAD model.

    The model is loaded lazily on first use and cached on the instance.
    """

    def __init__(self, device: Optional[str] = None):
        """
        Args:
            device: Torch device string ('cuda' or 'cpu'). Defaults to
                'cuda' when available, otherwise 'cpu'.
        """
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self._model = None  # populated on first access of `model`

    @property
    def model(self):
        """Lazily load and cache the SpeechBrain VAD model.

        BUGFIX: this was a plain method, but callers access it as an
        attribute (``self.model.get_speech_segments(...)``), which would
        raise ``AttributeError`` on the bound method object. Exposing it
        as a property fixes that without changing call sites.
        """
        if self._model is None:
            from speechbrain.inference.VAD import VAD
            import warnings
            import os
            # Suppress the use_auth_token deprecation warning from speechbrain
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", message=".*use_auth_token.*")
                model_dir = os.environ.get("MODEL_DIR", "pretrained_models")
                self._model = VAD.from_hparams(
                    source="speechbrain/vad-crdnn-libriparty",
                    savedir=os.path.join(model_dir, "vad"),
                    run_opts={"device": self.device}
                )
        return self._model

    def detect(self, audio_path: str,
               min_speech_duration: float = 0.25,
               min_silence_duration: float = 0.1) -> List["SpeechSegment"]:
        """
        Detect speech segments in an audio file.

        Args:
            audio_path: Path to audio file.
            min_speech_duration: Minimum speech duration (seconds) to keep.
            min_silence_duration: Minimum silence (seconds) to consider as a gap.

        Returns:
            List of SpeechSegment objects.
        """
        # get_speech_segments runs the full VAD pipeline (chunking,
        # energy-based double check, boundary merging).
        boundaries = self.model.get_speech_segments(
            audio_path,
            large_chunk_size=30,
            small_chunk_size=10,
            overlap_small_chunk=True,
            apply_energy_VAD=True,
            double_check=True,
            close_th=min_silence_duration,
            len_th=min_speech_duration
        )
        # boundaries is a tensor with shape [N, 2]; each row is [start, end]
        # in seconds. Re-filter by duration as a belt-and-suspenders check.
        segments = []
        if boundaries is not None and len(boundaries) > 0:
            for boundary in boundaries:
                start, end = float(boundary[0]), float(boundary[1])
                if end - start >= min_speech_duration:
                    segments.append(SpeechSegment(start=start, end=end))
        return segments

    def detect_from_waveform(self, waveform: torch.Tensor, sample_rate: int,
                             min_speech_duration: float = 0.25) -> List["SpeechSegment"]:
        """
        Detect speech segments from an in-memory waveform tensor.

        Args:
            waveform: Audio waveform tensor (as accepted by torchaudio.save).
            sample_rate: Sample rate of the waveform in Hz.
            min_speech_duration: Minimum speech duration (seconds) to keep.

        Returns:
            List of SpeechSegment objects.
        """
        import tempfile
        import torchaudio
        import os
        # SpeechBrain's VAD API only accepts a file path, so round-trip
        # through a temporary WAV file; delete=False so torchaudio can
        # reopen it by name (required on Windows), cleaned up in finally.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
            temp_path = f.name
        try:
            torchaudio.save(temp_path, waveform, sample_rate)
            return self.detect(temp_path, min_speech_duration)
        finally:
            if os.path.exists(temp_path):
                os.remove(temp_path)

    def get_total_speech(self, segments: List["SpeechSegment"]) -> float:
        """Return the total speech duration (seconds) across segments."""
        return sum(s.duration for s in segments)

    def get_speech_ratio(self, segments: List["SpeechSegment"],
                         total_duration: float) -> float:
        """Return the ratio of speech time to total duration (0.0 if empty)."""
        if total_duration == 0:
            return 0.0
        return self.get_total_speech(segments) / total_duration