# Commit 9109931: Fix permission denied — store models/data outside /app mount
"""
Voice Activity Detection - detect speech segments.
"""
import torch
from typing import List, Tuple, Optional
from dataclasses import dataclass
@dataclass
class SpeechSegment:
    """One contiguous span of detected speech, in seconds."""
    start: float  # segment start time (seconds)
    end: float    # segment end time (seconds)

    @property
    def duration(self) -> float:
        """Length of this segment in seconds (end - start)."""
        length = self.end - self.start
        return length
class VoiceActivityDetector:
    """Detect speech segments using the SpeechBrain CRDNN VAD model.

    The model is loaded lazily on first access of :attr:`model`, so
    constructing a detector is cheap and requires no downloads.
    """

    def __init__(self, device: Optional[str] = None):
        """
        Args:
            device: Torch device string (e.g. 'cuda', 'cpu'). Defaults to
                CUDA when available, otherwise CPU.
        """
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self._model = None  # lazily populated by the `model` property

    @property
    def model(self):
        """Lazy-load and cache the SpeechBrain VAD model."""
        if self._model is None:
            from speechbrain.inference.VAD import VAD
            import warnings
            # Suppress the use_auth_token deprecation warning from speechbrain
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", message=".*use_auth_token.*")
                import os
                # MODEL_DIR lets deployments place the model cache outside the
                # application mount (avoids permission-denied on read-only /app).
                model_dir = os.environ.get("MODEL_DIR", "pretrained_models")
                self._model = VAD.from_hparams(
                    source="speechbrain/vad-crdnn-libriparty",
                    savedir=os.path.join(model_dir, "vad"),
                    run_opts={"device": self.device}
                )
        return self._model

    def detect(self, audio_path: str,
               min_speech_duration: float = 0.25,
               min_silence_duration: float = 0.1) -> List[SpeechSegment]:
        """
        Detect speech segments in an audio file.

        Args:
            audio_path: Path to audio file
            min_speech_duration: Minimum speech duration (seconds) to keep
            min_silence_duration: Minimum silence (seconds) to treat as a gap

        Returns:
            List of SpeechSegment objects
        """
        # get_speech_segments runs the full SpeechBrain pipeline
        # (neural VAD + energy VAD + double-check) and returns boundaries.
        boundaries = self.model.get_speech_segments(
            audio_path,
            large_chunk_size=30,
            small_chunk_size=10,
            overlap_small_chunk=True,
            apply_energy_VAD=True,
            double_check=True,
            close_th=min_silence_duration,
            len_th=min_speech_duration
        )
        # boundaries is a tensor of shape [N, 2]; each row is [start, end]
        # in seconds. Re-filter by min_speech_duration as a safety net on
        # top of len_th.
        segments = []
        if boundaries is not None and len(boundaries) > 0:
            for boundary in boundaries:
                start, end = float(boundary[0]), float(boundary[1])
                if end - start >= min_speech_duration:
                    segments.append(SpeechSegment(start=start, end=end))
        return segments

    def detect_from_waveform(self, waveform: torch.Tensor, sample_rate: int,
                             min_speech_duration: float = 0.25,
                             min_silence_duration: float = 0.1) -> List[SpeechSegment]:
        """
        Detect speech segments from an in-memory waveform tensor.

        Args:
            waveform: Audio waveform tensor (channels x samples, as accepted
                by torchaudio.save)
            sample_rate: Sample rate in Hz
            min_speech_duration: Minimum speech duration (seconds) to keep
            min_silence_duration: Minimum silence (seconds) to treat as a gap

        Returns:
            List of SpeechSegment objects
        """
        import tempfile
        import torchaudio
        import os
        # SpeechBrain's VAD API operates on file paths, so round-trip
        # through a temporary WAV file. delete=False so the file persists
        # after the handle closes (needed on Windows for reopening).
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
            temp_path = f.name
        try:
            torchaudio.save(temp_path, waveform, sample_rate)
            # Forward both thresholds so callers get the same control as
            # detect() (previously min_silence_duration was not exposed here).
            return self.detect(temp_path, min_speech_duration, min_silence_duration)
        finally:
            # Always clean up the temp file, even if detection raises.
            if os.path.exists(temp_path):
                os.remove(temp_path)

    def get_total_speech(self, segments: List[SpeechSegment]) -> float:
        """Return the total speech duration (seconds) across all segments."""
        return sum(s.duration for s in segments)

    def get_speech_ratio(self, segments: List[SpeechSegment],
                         total_duration: float) -> float:
        """Return the ratio of speech time to total duration (0.0 when empty)."""
        if total_duration == 0:
            return 0.0
        return self.get_total_speech(segments) / total_duration