mazesmazes committed
Commit 29f8a60 · verified · Parent: 0549714

Update custom model files, README, and requirements

Files changed (1):
  asr_pipeline.py +7 -255
asr_pipeline.py CHANGED

@@ -282,160 +282,6 @@ class SpeakerDiarizer:
         return words
 
 
-class VoiceActivityDetector:
-    """Voice Activity Detection using pyannote for improved transcription quality.
-
-    Based on WhisperX implementation. Detects speech regions in audio and chunks
-    them for more accurate transcription of long audio files.
-    """
-
-    _model = None
-    _pipeline = None
-
-    @classmethod
-    def get_instance(cls, vad_onset: float = 0.5, vad_offset: float = 0.363):
-        """Get or create the VAD pipeline instance.
-
-        Args:
-            vad_onset: Threshold for speech start detection (default 0.5)
-            vad_offset: Threshold for speech end detection (default 0.363)
-        """
-        if cls._pipeline is None:
-            from pyannote.audio import Model
-            from pyannote.audio.pipelines import VoiceActivityDetection
-
-            # Load the segmentation model
-            cls._model = Model.from_pretrained(
-                "pyannote/segmentation-3.0",
-            )
-
-            # Create VAD pipeline with hyperparameters
-            cls._pipeline = VoiceActivityDetection(segmentation=cls._model)
-            cls._pipeline.instantiate({
-                "onset": vad_onset,
-                "offset": vad_offset,
-                "min_duration_on": 0.1,  # Min speech duration (100ms)
-                "min_duration_off": 0.1,  # Min silence duration (100ms)
-            })
-
-            # Move to GPU if available
-            if torch.cuda.is_available():
-                cls._pipeline.to(torch.device("cuda"))
-            elif torch.backends.mps.is_available():
-                cls._pipeline.to(torch.device("mps"))
-
-        return cls._pipeline
-
-    @classmethod
-    def detect(
-        cls,
-        audio: np.ndarray,
-        sample_rate: int = 16000,
-        vad_onset: float = 0.5,
-        vad_offset: float = 0.363,
-    ) -> list[dict]:
-        """Detect speech regions in audio.
-
-        Args:
-            audio: Audio waveform as numpy array
-            sample_rate: Audio sample rate (default 16000)
-            vad_onset: Threshold for speech start detection
-            vad_offset: Threshold for speech end detection
-
-        Returns:
-            List of dicts with 'start', 'end' keys (in seconds)
-        """
-        pipeline = cls.get_instance(vad_onset, vad_offset)
-
-        # Prepare audio input
-        waveform = torch.from_numpy(audio).float()
-        if waveform.dim() == 1:
-            waveform = waveform.unsqueeze(0)
-
-        audio_input = {"waveform": waveform, "sample_rate": sample_rate}
-
-        # Run VAD
-        vad_result = pipeline(audio_input)
-
-        # Convert to list of segments
-        segments = []
-        for speech_turn in vad_result.get_timeline():
-            segments.append({
-                "start": speech_turn.start,
-                "end": speech_turn.end,
-            })
-
-        return segments
-
-    @classmethod
-    def merge_chunks(
-        cls,
-        segments: list[dict],
-        chunk_size: float = 30.0,
-    ) -> list[dict]:
-        """Merge VAD segments into larger chunks for batched processing.
-
-        Args:
-            segments: List of VAD segments with 'start', 'end' keys
-            chunk_size: Maximum chunk duration in seconds (default 30)
-
-        Returns:
-            List of chunks with 'start', 'end', 'segments' keys
-        """
-        if not segments:
-            return []
-
-        merged = []
-        curr_start = segments[0]["start"]
-        curr_end = segments[0]["end"]
-        curr_segments = []
-
-        for seg in segments:
-            # If adding this segment exceeds chunk_size, finalize current chunk
-            if seg["end"] - curr_start > chunk_size and curr_segments:
-                merged.append({
-                    "start": curr_start,
-                    "end": curr_end,
-                    "segments": curr_segments,
-                })
-                curr_start = seg["start"]
-                curr_segments = []
-
-            curr_end = seg["end"]
-            curr_segments.append((seg["start"], seg["end"]))
-
-        # Add final chunk
-        if curr_segments:
-            merged.append({
-                "start": curr_start,
-                "end": curr_end,
-                "segments": curr_segments,
-            })
-
-        return merged
-
-    @classmethod
-    def extract_chunk_audio(
-        cls,
-        audio: np.ndarray,
-        chunk: dict,
-        sample_rate: int = 16000,
-    ) -> np.ndarray:
-        """Extract audio for a specific chunk.
-
-        Args:
-            audio: Full audio waveform
-            chunk: Chunk dict with 'start', 'end' keys
-            sample_rate: Audio sample rate
-
-        Returns:
-            Audio chunk as numpy array
-        """
-        start_sample = int(chunk["start"] * sample_rate)
-        end_sample = int(chunk["end"] * sample_rate)
-        return audio[start_sample:end_sample]
-
-
 class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
     """ASR Pipeline for audio-to-text transcription."""
 
@@ -462,10 +308,6 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
         kwargs.pop("min_speakers", None)
         kwargs.pop("max_speakers", None)
         kwargs.pop("hf_token", None)
-        kwargs.pop("use_vad", None)
-        kwargs.pop("vad_onset", None)
-        kwargs.pop("vad_offset", None)
-        kwargs.pop("chunk_size", None)
 
         return super()._sanitize_parameters(**kwargs)
 
@@ -474,14 +316,10 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
         inputs,
         **kwargs,
     ):
-        """Transcribe audio with optional VAD, timestamps, and speaker diarization.
+        """Transcribe audio with optional word-level timestamps and speaker diarization.
 
         Args:
             inputs: Audio input (file path, dict with array/sampling_rate, etc.)
-            use_vad: If True, use Voice Activity Detection to chunk audio (recommended for long audio)
-            vad_onset: VAD speech start threshold (default 0.5)
-            vad_offset: VAD speech end threshold (default 0.363)
-            chunk_size: Maximum chunk duration in seconds for VAD (default 30)
             return_timestamps: If True, return word-level timestamps using forced alignment
             return_speakers: If True, return speaker labels for each word
             num_speakers: Exact number of speakers (if known, for diarization)
@@ -492,13 +330,9 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
 
         Returns:
             Dict with 'text' key, 'words' key if return_timestamps=True,
-            'vad_segments' if use_vad=True, and speaker labels on words if return_speakers=True
+            and speaker labels on words if return_speakers=True
         """
         # Extract our params before super().__call__ (which will also call _sanitize_parameters)
-        use_vad = kwargs.pop("use_vad", False)
-        vad_onset = kwargs.pop("vad_onset", 0.5)
-        vad_offset = kwargs.pop("vad_offset", 0.363)
-        chunk_size = kwargs.pop("chunk_size", 30.0)
         return_timestamps = kwargs.pop("return_timestamps", False)
         return_speakers = kwargs.pop("return_speakers", False)
         diarization_params = {
@@ -511,25 +345,12 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
         if return_speakers:
             return_timestamps = True
 
-        # Extract audio for VAD, timestamps, and diarization
-        audio_data = self._extract_audio(inputs)
-
-        # Use VAD to chunk and transcribe long audio
-        if use_vad and audio_data is not None:
-            result = self._transcribe_with_vad(
-                audio_data,
-                vad_onset=vad_onset,
-                vad_offset=vad_offset,
-                chunk_size=chunk_size,
-                **kwargs,
-            )
-        else:
-            # Store audio for timestamp alignment and diarization
-            if return_timestamps or return_speakers:
-                self._current_audio = audio_data
+        # Store audio for timestamp alignment and diarization
+        if return_timestamps or return_speakers:
+            self._current_audio = self._extract_audio(inputs)
 
-            # Run standard transcription
-            result = super().__call__(inputs, **kwargs)
+        # Run standard transcription
+        result = super().__call__(inputs, **kwargs)
 
         # Add timestamps if requested
         if return_timestamps and self._current_audio is not None:
@@ -602,75 +423,6 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
 
         return None
 
-    def _transcribe_with_vad(
-        self,
-        audio_data: dict,
-        vad_onset: float = 0.5,
-        vad_offset: float = 0.363,
-        chunk_size: float = 30.0,
-        **kwargs,
-    ) -> dict:
-        """Transcribe audio using VAD to chunk long audio.
-
-        Args:
-            audio_data: Dict with 'array' and 'sampling_rate' keys
-            vad_onset: VAD speech start threshold
-            vad_offset: VAD speech end threshold
-            chunk_size: Maximum chunk duration in seconds
-            **kwargs: Additional arguments passed to transcription
-
-        Returns:
-            Dict with 'text', 'vad_segments', and 'chunks' keys
-        """
-        audio = audio_data["array"]
-        sample_rate = audio_data.get("sampling_rate", 16000)
-
-        # Run VAD to detect speech regions
-        vad_segments = VoiceActivityDetector.detect(
-            audio,
-            sample_rate=sample_rate,
-            vad_onset=vad_onset,
-            vad_offset=vad_offset,
-        )
-
-        if not vad_segments:
-            return {"text": "", "vad_segments": [], "chunks": []}
-
-        # Merge segments into chunks
-        chunks = VoiceActivityDetector.merge_chunks(vad_segments, chunk_size)
-
-        # Transcribe each chunk
-        all_text = []
-        chunk_results = []
-
-        for chunk in chunks:
-            # Extract chunk audio
-            chunk_audio = VoiceActivityDetector.extract_chunk_audio(
-                audio, chunk, sample_rate
-            )
-
-            # Transcribe chunk
-            chunk_input = {"raw": chunk_audio, "sampling_rate": sample_rate}
-            chunk_result = super().__call__(chunk_input, **kwargs)
-
-            chunk_text = chunk_result.get("text", "").strip()
-            all_text.append(chunk_text)
-
-            chunk_results.append({
-                "start": chunk["start"],
-                "end": chunk["end"],
-                "text": chunk_text,
-            })
-
-        # Store audio for potential timestamp/diarization
-        self._current_audio = audio_data
-
-        return {
-            "text": " ".join(all_text),
-            "vad_segments": vad_segments,
-            "chunks": chunk_results,
-        }
-
     def preprocess(self, inputs, **preprocess_params):
         # Handle dict with "array" key (from datasets)
         if isinstance(inputs, dict) and "array" in inputs:
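
After this change, __call__ keeps only the timestamp and diarization options. A minimal usage sketch follows; the repo id and audio path are placeholders, and loading the custom ASRPipeline through transformers.pipeline with trust_remote_code=True is an assumption about how this repo is consumed:

from transformers import pipeline

# Placeholder repo id; trust_remote_code is assumed to be needed because
# ASRPipeline is custom pipeline code shipped with the model repo.
asr = pipeline(
    "automatic-speech-recognition",
    model="mazesmazes/your-asr-model",  # hypothetical repo id
    trust_remote_code=True,
)

result = asr(
    "meeting.wav",               # placeholder audio file
    return_timestamps=True,      # word-level timestamps via forced alignment
    return_speakers=True,        # per-word speaker labels (implies timestamps)
    min_speakers=2,              # diarization hints, per the kept kwargs;
    max_speakers=4,              # hf_token is also accepted for gated models
)

print(result["text"])
for word in result.get("words", []):
    print(word)

Note that use_vad, vad_onset, vad_offset, and chunk_size are no longer filtered out by _sanitize_parameters; passing them after this commit would reach the base transformers pipeline and likely raise an error.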
 
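Callers that relied on the removed use_vad path can reproduce the chunked transcription outside the pipeline. A minimal sketch, assuming pyannote.audio is installed; it mirrors the segmentation model, thresholds, and merge logic of the deleted VoiceActivityDetector, and the helper name speech_chunks is hypothetical:

import numpy as np
import torch
from pyannote.audio import Model
from pyannote.audio.pipelines import VoiceActivityDetection

# Same segmentation model and hyperparameters as the deleted
# VoiceActivityDetector.get_instance.
model = Model.from_pretrained("pyannote/segmentation-3.0")
vad = VoiceActivityDetection(segmentation=model)
vad.instantiate({
    "onset": 0.5,             # speech start threshold
    "offset": 0.363,          # speech end threshold
    "min_duration_on": 0.1,   # min speech duration (100ms)
    "min_duration_off": 0.1,  # min silence duration (100ms)
})

def speech_chunks(audio: np.ndarray, sample_rate: int = 16000,
                  chunk_size: float = 30.0) -> list[np.ndarray]:
    """Detect speech and slice the waveform into chunks of at most
    chunk_size seconds, following the deleted merge_chunks logic."""
    waveform = torch.from_numpy(audio).float().unsqueeze(0)
    timeline = vad({"waveform": waveform, "sample_rate": sample_rate}).get_timeline()

    spans, start, end = [], None, None
    for turn in timeline:
        if start is None:
            start, end = turn.start, turn.end
        elif turn.end - start > chunk_size:
            # Adding this region would exceed chunk_size: flush and restart
            spans.append((start, end))
            start, end = turn.start, turn.end
        else:
            end = turn.end
    if start is not None:
        spans.append((start, end))

    return [audio[int(s * sample_rate):int(e * sample_rate)] for s, e in spans]

Each chunk can then be fed to the pipeline as {"raw": chunk, "sampling_rate": sample_rate}, as the removed _transcribe_with_vad did, and the per-chunk texts joined with spaces.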