Update custom model files, README, and requirements
asr_pipeline.py  CHANGED  +255 -7
@@ -282,6 +282,160 @@ class SpeakerDiarizer:
         return words


+class VoiceActivityDetector:
+    """Voice Activity Detection using pyannote for improved transcription quality.
+
+    Based on WhisperX implementation. Detects speech regions in audio and chunks
+    them for more accurate transcription of long audio files.
+    """
+
+    _model = None
+    _pipeline = None
+
+    @classmethod
+    def get_instance(cls, vad_onset: float = 0.5, vad_offset: float = 0.363):
+        """Get or create the VAD pipeline instance.
+
+        Args:
+            vad_onset: Threshold for speech start detection (default 0.5)
+            vad_offset: Threshold for speech end detection (default 0.363)
+        """
+        if cls._pipeline is None:
+            from pyannote.audio import Model
+            from pyannote.audio.pipelines import VoiceActivityDetection
+
+            # Load the segmentation model
+            cls._model = Model.from_pretrained(
+                "pyannote/segmentation-3.0",
+            )
+
+            # Create VAD pipeline with hyperparameters
+            cls._pipeline = VoiceActivityDetection(segmentation=cls._model)
+            cls._pipeline.instantiate({
+                "onset": vad_onset,
+                "offset": vad_offset,
+                "min_duration_on": 0.1,   # Min speech duration (100ms)
+                "min_duration_off": 0.1,  # Min silence duration (100ms)
+            })
+
+            # Move to GPU if available
+            if torch.cuda.is_available():
+                cls._pipeline.to(torch.device("cuda"))
+            elif torch.backends.mps.is_available():
+                cls._pipeline.to(torch.device("mps"))
+
+        return cls._pipeline
+
+    @classmethod
+    def detect(
+        cls,
+        audio: np.ndarray,
+        sample_rate: int = 16000,
+        vad_onset: float = 0.5,
+        vad_offset: float = 0.363,
+    ) -> list[dict]:
+        """Detect speech regions in audio.
+
+        Args:
+            audio: Audio waveform as numpy array
+            sample_rate: Audio sample rate (default 16000)
+            vad_onset: Threshold for speech start detection
+            vad_offset: Threshold for speech end detection
+
+        Returns:
+            List of dicts with 'start', 'end' keys (in seconds)
+        """
+        pipeline = cls.get_instance(vad_onset, vad_offset)
+
+        # Prepare audio input
+        waveform = torch.from_numpy(audio).float()
+        if waveform.dim() == 1:
+            waveform = waveform.unsqueeze(0)
+
+        audio_input = {"waveform": waveform, "sample_rate": sample_rate}
+
+        # Run VAD
+        vad_result = pipeline(audio_input)
+
+        # Convert to list of segments
+        segments = []
+        for speech_turn in vad_result.get_timeline():
+            segments.append({
+                "start": speech_turn.start,
+                "end": speech_turn.end,
+            })
+
+        return segments
+
+    @classmethod
+    def merge_chunks(
+        cls,
+        segments: list[dict],
+        chunk_size: float = 30.0,
+    ) -> list[dict]:
+        """Merge VAD segments into larger chunks for batched processing.
+
+        Args:
+            segments: List of VAD segments with 'start', 'end' keys
+            chunk_size: Maximum chunk duration in seconds (default 30)
+
+        Returns:
+            List of chunks with 'start', 'end', 'segments' keys
+        """
+        if not segments:
+            return []
+
+        merged = []
+        curr_start = segments[0]["start"]
+        curr_end = segments[0]["end"]
+        curr_segments = []
+
+        for seg in segments:
+            # If adding this segment exceeds chunk_size, finalize current chunk
+            if seg["end"] - curr_start > chunk_size and curr_segments:
+                merged.append({
+                    "start": curr_start,
+                    "end": curr_end,
+                    "segments": curr_segments,
+                })
+                curr_start = seg["start"]
+                curr_segments = []
+
+            curr_end = seg["end"]
+            curr_segments.append((seg["start"], seg["end"]))
+
+        # Add final chunk
+        if curr_segments:
+            merged.append({
+                "start": curr_start,
+                "end": curr_end,
+                "segments": curr_segments,
+            })
+
+        return merged
+
+    @classmethod
+    def extract_chunk_audio(
+        cls,
+        audio: np.ndarray,
+        chunk: dict,
+        sample_rate: int = 16000,
+    ) -> np.ndarray:
+        """Extract audio for a specific chunk.
+
+        Args:
+            audio: Full audio waveform
+            chunk: Chunk dict with 'start', 'end' keys
+            sample_rate: Audio sample rate
+
+        Returns:
+            Audio chunk as numpy array
+        """
+        start_sample = int(chunk["start"] * sample_rate)
+        end_sample = int(chunk["end"] * sample_rate)
+        return audio[start_sample:end_sample]
+
+
 class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
     """ASR Pipeline for audio-to-text transcription."""

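For context, a minimal sketch (not part of this commit) of how the new VoiceActivityDetector could be exercised on its own. It assumes the gated pyannote/segmentation-3.0 checkpoint is already accessible to pyannote on this machine, and uses soundfile to load a hypothetical 16 kHz mono recording; any loader that yields a float32 numpy array works.

import soundfile as sf  # assumed available; not required by the diff itself

# Hypothetical input file; the detector expects mono audio at the given sample rate.
audio, sr = sf.read("example_16k_mono.wav", dtype="float32")

# Detect speech regions, then merge them into chunks of at most 30 s for batched transcription.
segments = VoiceActivityDetector.detect(audio, sample_rate=sr)
chunks = VoiceActivityDetector.merge_chunks(segments, chunk_size=30.0)

if chunks:
    # Slice the raw samples belonging to the first chunk.
    first_chunk_audio = VoiceActivityDetector.extract_chunk_audio(audio, chunks[0], sample_rate=sr)
    print(f"{len(segments)} speech regions merged into {len(chunks)} chunks; "
          f"first chunk {chunks[0]['start']:.2f}-{chunks[0]['end']:.2f}s, "
          f"{len(first_chunk_audio)} samples")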
@@ -308,6 +462,10 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
         kwargs.pop("min_speakers", None)
         kwargs.pop("max_speakers", None)
         kwargs.pop("hf_token", None)
+        kwargs.pop("use_vad", None)
+        kwargs.pop("vad_onset", None)
+        kwargs.pop("vad_offset", None)
+        kwargs.pop("chunk_size", None)

         return super()._sanitize_parameters(**kwargs)

@@ -316,10 +474,14 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
         inputs,
         **kwargs,
     ):
-        """Transcribe audio with optional
+        """Transcribe audio with optional VAD, timestamps, and speaker diarization.

         Args:
             inputs: Audio input (file path, dict with array/sampling_rate, etc.)
+            use_vad: If True, use Voice Activity Detection to chunk audio (recommended for long audio)
+            vad_onset: VAD speech start threshold (default 0.5)
+            vad_offset: VAD speech end threshold (default 0.363)
+            chunk_size: Maximum chunk duration in seconds for VAD (default 30)
             return_timestamps: If True, return word-level timestamps using forced alignment
             return_speakers: If True, return speaker labels for each word
             num_speakers: Exact number of speakers (if known, for diarization)
@@ -330,9 +492,13 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):

         Returns:
             Dict with 'text' key, 'words' key if return_timestamps=True,
-            and speaker labels on words if return_speakers=True
+            'vad_segments' if use_vad=True, and speaker labels on words if return_speakers=True
         """
         # Extract our params before super().__call__ (which will also call _sanitize_parameters)
+        use_vad = kwargs.pop("use_vad", False)
+        vad_onset = kwargs.pop("vad_onset", 0.5)
+        vad_offset = kwargs.pop("vad_offset", 0.363)
+        chunk_size = kwargs.pop("chunk_size", 30.0)
         return_timestamps = kwargs.pop("return_timestamps", False)
         return_speakers = kwargs.pop("return_speakers", False)
         diarization_params = {
@@ -345,12 +511,25 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
         if return_speakers:
             return_timestamps = True

+        # Extract audio for VAD, timestamps, and diarization
+        audio_data = self._extract_audio(inputs)
+
+        # Use VAD to chunk and transcribe long audio
+        if use_vad and audio_data is not None:
+            result = self._transcribe_with_vad(
+                audio_data,
+                vad_onset=vad_onset,
+                vad_offset=vad_offset,
+                chunk_size=chunk_size,
+                **kwargs,
+            )
+        else:
+            # Store audio for timestamp alignment and diarization
+            if return_timestamps or return_speakers:
+                self._current_audio = audio_data

+            # Run standard transcription
+            result = super().__call__(inputs, **kwargs)

         # Add timestamps if requested
         if return_timestamps and self._current_audio is not None:
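As a usage sketch (not part of the diff), the new parameters are passed straight to __call__. The repo id below is a placeholder, and this assumes the repository wires ASRPipeline up as a Hub custom pipeline loaded with trust_remote_code; the exact loading mechanism is not shown in these hunks.

from transformers import pipeline

# Placeholder repo id; assumes it resolves to this custom ASRPipeline implementation.
asr = pipeline(
    "automatic-speech-recognition",
    model="your-namespace/your-asr-model",
    trust_remote_code=True,
)

result = asr(
    "long_recording.wav",  # hypothetical long audio file
    use_vad=True,          # chunk the audio with VoiceActivityDetector before transcribing
    chunk_size=30.0,       # merge speech regions into chunks of at most 30 s
    vad_onset=0.5,
    vad_offset=0.363,
)
print(result["text"])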
@@ -423,6 +602,75 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):

         return None

+    def _transcribe_with_vad(
+        self,
+        audio_data: dict,
+        vad_onset: float = 0.5,
+        vad_offset: float = 0.363,
+        chunk_size: float = 30.0,
+        **kwargs,
+    ) -> dict:
+        """Transcribe audio using VAD to chunk long audio.
+
+        Args:
+            audio_data: Dict with 'array' and 'sampling_rate' keys
+            vad_onset: VAD speech start threshold
+            vad_offset: VAD speech end threshold
+            chunk_size: Maximum chunk duration in seconds
+            **kwargs: Additional arguments passed to transcription
+
+        Returns:
+            Dict with 'text', 'vad_segments', and 'chunks' keys
+        """
+        audio = audio_data["array"]
+        sample_rate = audio_data.get("sampling_rate", 16000)
+
+        # Run VAD to detect speech regions
+        vad_segments = VoiceActivityDetector.detect(
+            audio,
+            sample_rate=sample_rate,
+            vad_onset=vad_onset,
+            vad_offset=vad_offset,
+        )
+
+        if not vad_segments:
+            return {"text": "", "vad_segments": [], "chunks": []}
+
+        # Merge segments into chunks
+        chunks = VoiceActivityDetector.merge_chunks(vad_segments, chunk_size)
+
+        # Transcribe each chunk
+        all_text = []
+        chunk_results = []
+
+        for chunk in chunks:
+            # Extract chunk audio
+            chunk_audio = VoiceActivityDetector.extract_chunk_audio(
+                audio, chunk, sample_rate
+            )
+
+            # Transcribe chunk
+            chunk_input = {"raw": chunk_audio, "sampling_rate": sample_rate}
+            chunk_result = super().__call__(chunk_input, **kwargs)
+
+            chunk_text = chunk_result.get("text", "").strip()
+            all_text.append(chunk_text)
+
+            chunk_results.append({
+                "start": chunk["start"],
+                "end": chunk["end"],
+                "text": chunk_text,
+            })
+
+        # Store audio for potential timestamp/diarization
+        self._current_audio = audio_data
+
+        return {
+            "text": " ".join(all_text),
+            "vad_segments": vad_segments,
+            "chunks": chunk_results,
+        }
+
     def preprocess(self, inputs, **preprocess_params):
         # Handle dict with "array" key (from datasets)
         if isinstance(inputs, dict) and "array" in inputs:
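When use_vad=True, the dict returned by __call__ carries the extra keys produced by _transcribe_with_vad. A small hypothetical example of consuming them, reusing `result` from the call sketch above:

# Each chunk carries its own start/end (in seconds) and transcript.
for chunk in result["chunks"]:
    print(f"[{chunk['start']:7.2f}s - {chunk['end']:7.2f}s] {chunk['text']}")

print(f"Detected {len(result['vad_segments'])} raw speech regions before merging")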