Update custom model files, README, and requirements
- asr_pipeline.py +54 -168
- diarization.py +29 -121
asr_pipeline.py
CHANGED
@@ -1,7 +1,6 @@
 """ASR pipeline for audio-to-text transcription with optional timestamps and diarization."""
 
 import re
-from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 
@@ -24,135 +23,8 @@ def _get_device() -> str:
     return "cpu"
 
 
-@dataclass
-class _AlignPoint:
-    """A point in the alignment path."""
-
-    token_index: int
-    time_index: int
-    score: float
-
-
-@dataclass
-class _AlignSegment:
-    """An aligned character/word segment."""
-
-    label: str
-    start: int
-    end: int
-    score: float
-
-    @property
-    def length(self):
-        return self.end - self.start
-
-
-def _get_trellis(emission: torch.Tensor, tokens: list[int], blank_id: int = 0) -> torch.Tensor:
-    """Build dynamic programming trellis for CTC alignment.
-
-    Based on WhisperX's alignment algorithm for improved accuracy.
-    """
-    num_frame = emission.size(0)
-    num_tokens = len(tokens)
-
-    trellis = torch.zeros((num_frame, num_tokens))
-    trellis[1:, 0] = torch.cumsum(emission[1:, blank_id], 0)
-    trellis[0, 1:] = -float("inf")
-    trellis[-num_tokens + 1 :, 0] = float("inf")
-
-    for t in range(num_frame - 1):
-        trellis[t + 1, 1:] = torch.maximum(
-            # Score for staying at the same token
-            trellis[t, 1:] + emission[t, blank_id],
-            # Score for changing to the next token
-            trellis[t, :-1] + emission[t, tokens[1:]],
-        )
-    return trellis
-
-
-def _backtrack(
-    trellis: torch.Tensor,
-    emission: torch.Tensor,
-    tokens: list[int],
-    blank_id: int = 0,
-) -> list[_AlignPoint]:
-    """Backtrack through trellis to find optimal alignment path."""
-    t, j = trellis.size(0) - 1, trellis.size(1) - 1
-
-    path = [_AlignPoint(j, t, emission[t, blank_id].exp().item())]
-    while j > 0:
-        assert t > 0
-
-        p_stay = emission[t - 1, blank_id]
-        p_change = emission[t - 1, tokens[j]]
-
-        stayed = trellis[t - 1, j] + p_stay
-        changed = trellis[t - 1, j - 1] + p_change
-
-        t -= 1
-        if changed > stayed:
-            j -= 1
-
-        prob = (p_change if changed > stayed else p_stay).exp().item()
-        path.append(_AlignPoint(j, t, prob))
-
-    while t > 0:
-        prob = emission[t - 1, blank_id].exp().item()
-        path.append(_AlignPoint(j, t - 1, prob))
-        t -= 1
-
-    return path[::-1]
-
-
-def _merge_repeats(path: list[_AlignPoint], transcript: str) -> list[_AlignSegment]:
-    """Merge repeated tokens into character segments."""
-    i1, i2 = 0, 0
-    segments = []
-    while i1 < len(path):
-        while i2 < len(path) and path[i1].token_index == path[i2].token_index:
-            i2 += 1
-        score = sum(path[k].score for k in range(i1, i2)) / (i2 - i1)
-        segments.append(
-            _AlignSegment(
-                transcript[path[i1].token_index],
-                path[i1].time_index,
-                path[i2 - 1].time_index + 1,
-                score,
-            )
-        )
-        i1 = i2
-    return segments
-
-
-def _merge_words(segments: list[_AlignSegment], separator: str = "|") -> list[_AlignSegment]:
-    """Merge character segments into word segments."""
-    words = []
-    i1, i2 = 0, 0
-    while i1 < len(segments):
-        if i2 >= len(segments) or segments[i2].label == separator:
-            if i1 != i2:
-                segs = segments[i1:i2]
-                word = "".join([seg.label for seg in segs])
-                total_length = sum(seg.length for seg in segs)
-                score = (
-                    sum(seg.score * seg.length for seg in segs) / total_length
-                    if total_length > 0
-                    else 0
-                )
-                words.append(_AlignSegment(word, segments[i1].start, segments[i2 - 1].end, score))
-            i1 = i2 + 1
-            i2 = i1
-        else:
-            i2 += 1
-    return words
-
-
 class ForcedAligner:
-    """
-
-    Uses WhisperX-style dynamic programming alignment for improved accuracy
-    over simple CTC greedy alignment.
-    """
+    """Lazy-loaded forced aligner for word-level timestamps using torchaudio wav2vec2."""
 
     _bundle = None
     _model = None
@@ -172,8 +44,7 @@ class ForcedAligner:
         if cls._model is None:
             import torchaudio
 
-
-            cls._bundle = torchaudio.pipelines.WAV2VEC2_ASR_LARGE_960H
+            cls._bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
             cls._model = cls._bundle.get_model().to(device)
             cls._model.eval()
             cls._labels = cls._bundle.get_labels()
@@ -186,29 +57,28 @@ class ForcedAligner:
         audio: np.ndarray,
         text: str,
         sample_rate: int = 16000,
-        _language: str = "
+        _language: str = "eng",
         _batch_size: int = 16,
     ) -> list[dict]:
         """Align transcript to audio and return word-level timestamps.
 
-        Uses WhisperX-style dynamic programming for improved alignment accuracy.
-
         Args:
             audio: Audio waveform as numpy array
            text: Transcript text to align
            sample_rate: Audio sample rate (default 16000)
-            _language:
-            _batch_size: Batch size (unused)
+            _language: ISO-639-3 language code (default "eng" for English, unused)
+            _batch_size: Batch size for alignment model (unused)
 
         Returns:
            List of dicts with 'word', 'start', 'end' keys
         """
         import torchaudio
+        from torchaudio.functional import forced_align, merge_tokens
 
         device = _get_device()
         model, labels, dictionary = cls.get_instance(device)
 
-        # Convert audio to tensor
+        # Convert audio to tensor (copy to ensure array is writable)
         if isinstance(audio, np.ndarray):
             waveform = torch.from_numpy(audio.copy()).float()
         else:
@@ -218,7 +88,7 @@ class ForcedAligner:
         if waveform.dim() == 1:
             waveform = waveform.unsqueeze(0)
 
-        # Resample if needed
+        # Resample if needed (wav2vec2 expects 16kHz)
         if sample_rate != cls._bundle.sample_rate:
             waveform = torchaudio.functional.resample(
                 waveform, sample_rate, cls._bundle.sample_rate
@@ -233,47 +103,67 @@ class ForcedAligner:
 
         emission = emissions[0].cpu()
 
-        # Normalize text
+        # Normalize text: uppercase, keep only valid characters
         transcript = text.upper()
+        # Build tokens from transcript
         tokens = []
-        clean_transcript = ""
-
         for char in transcript:
             if char in dictionary:
                 tokens.append(dictionary[char])
-                clean_transcript += char
             elif char == " ":
-
-                tokens.append(sep_token)
-                clean_transcript += "|"
+                tokens.append(dictionary.get("|", dictionary.get(" ", 0)))
 
         if not tokens:
             return []
 
-
-
-
+        targets = torch.tensor([tokens], dtype=torch.int32)
+
+        # Run forced alignment
+        # Note: forced_align is deprecated in torchaudio 2.6+ and will be removed in 2.9 (late 2025)
+        # No official replacement announced yet. See https://github.com/pytorch/audio/issues/3902
+        aligned_tokens, scores = forced_align(emission.unsqueeze(0), targets, blank=0)
 
-        #
-
-        word_segments = _merge_words(char_segments, separator="|")
+        # Use torchaudio's merge_tokens to get token spans (removes blanks and merges repeats)
+        token_spans = merge_tokens(aligned_tokens[0], scores[0])
 
-        # Convert frame indices to time
-        frame_duration = 320 / cls._bundle.sample_rate
+        # Convert frame indices to time (model stride is 320 samples at 16kHz = 20ms)
+        frame_duration = 320 / cls._bundle.sample_rate
 
-        #
+        # Group token spans into words based on pipe separator
         words = text.split()
         word_timestamps = []
-
-
-
-
-
-
-
-
-
-
+        current_word_start = None
+        current_word_end = None
+        word_idx = 0
+
+        for span in token_spans:
+            token_char = labels[span.token]
+            if token_char == "|":  # Word separator
+                if current_word_start is not None and word_idx < len(words):
+                    word_timestamps.append(
+                        {
+                            "word": words[word_idx],
+                            "start": current_word_start * frame_duration,
+                            "end": current_word_end * frame_duration,
+                        }
+                    )
+                    word_idx += 1
+                current_word_start = None
+                current_word_end = None
+            else:
+                if current_word_start is None:
+                    current_word_start = span.start
+                current_word_end = span.end
+
+        # Don't forget the last word
+        if current_word_start is not None and word_idx < len(words):
+            word_timestamps.append(
+                {
+                    "word": words[word_idx],
+                    "start": current_word_start * frame_duration,
+                    "end": current_word_end * frame_duration,
+                }
+            )
 
         return word_timestamps
 
@@ -339,8 +229,6 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
             num_speakers: Exact number of speakers (if known, for diarization)
             min_speakers: Minimum number of speakers (for diarization)
             max_speakers: Maximum number of speakers (for diarization)
-            hf_token: HuggingFace token for pyannote models (or set HF_TOKEN env var)
-            diarization_backend: Backend for diarization ("pyannote" or "local")
             **kwargs: Additional arguments passed to the pipeline
 
         Returns:
@@ -355,8 +243,6 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
             "num_speakers": kwargs.pop("num_speakers", None),
             "min_speakers": kwargs.pop("min_speakers", None),
            "max_speakers": kwargs.pop("max_speakers", None),
-            "hf_token": kwargs.pop("hf_token", None),
-            "backend": kwargs.pop("diarization_backend", "pyannote"),
         }
 
         if return_speakers:
diarization.py
CHANGED
@@ -1,8 +1,9 @@
-"""Speaker diarization
+"""Speaker diarization using TEN-VAD + WavLM + spectral clustering.
 
-
-
-
+Pipeline:
+1. TEN-VAD detects speech segments
+2. WavLM (microsoft/wavlm-base-plus-sv) extracts speaker embeddings
+3. Spectral clustering groups embeddings by speaker
 
 Spectral clustering implementation adapted from FunASR/3D-Speaker:
 https://github.com/alibaba-damo-academy/FunASR
@@ -244,12 +245,12 @@ class SpeakerClusterer:
 
 
 class LocalSpeakerDiarizer:
-    """Local speaker diarization using TEN-VAD +
+    """Local speaker diarization using TEN-VAD + WavLM + spectral clustering.
 
     Pipeline:
     1. TEN-VAD detects speech segments
     2. Sliding window (1.0s, 75% overlap) for uniform embedding extraction
-    3.
+    3. WavLM extracts speaker embeddings per window
     4. Spectral clustering with eigenvalue gap for auto speaker detection
     5. Frame-level consensus voting for segment reconstruction
     6. Post-processing merges short segments to reduce flicker
@@ -268,7 +269,7 @@ class LocalSpeakerDiarizer:
     """
 
     _ten_vad_model = None
-
+    _speaker_model = None
     _device = None
 
     # ==================== TUNABLE PARAMETERS ====================
@@ -312,25 +313,21 @@ class LocalSpeakerDiarizer:
         return cls._device
 
     @classmethod
-    def
-        """Lazy-load
-        if cls.
-            from
-
-
-
-                task=Tasks.speaker_verification,
-                model="iic/speech_eres2netv2_sv_zh-cn_16k-common",
+    def _get_speaker_model(cls):
+        """Lazy-load WavLM speaker embedding model (singleton)."""
+        if cls._speaker_model is None:
+            from transformers import WavLMForXVector
+
+            cls._speaker_model = WavLMForXVector.from_pretrained(
+                "microsoft/wavlm-base-plus-sv",
             )
-            cls._eres2netv2_model = sv_pipeline.model
 
-            # Move model to
+            # Move model to best available device (MPS/CUDA/CPU)
             device = cls._get_device()
-            cls.
-            cls.
-            cls._eres2netv2_model.eval()
+            cls._speaker_model = cls._speaker_model.to(device)
+            cls._speaker_model.eval()
 
-        return cls.
+        return cls._speaker_model
 
     @classmethod
     def diarize(
@@ -487,7 +484,7 @@ class LocalSpeakerDiarizer:
         cls, audio_array: np.ndarray, segments: list[dict], sample_rate: int
     ) -> tuple[np.ndarray, list[dict]]:
         """Extract speaker embeddings using sliding windows."""
-        speaker_model = cls.
+        speaker_model = cls._get_speaker_model()
         device = cls._get_device()
 
         window_samples = int(cls.WINDOW_SIZE * sample_rate)
@@ -525,9 +522,10 @@ class LocalSpeakerDiarizer:
                 pad_width = window_samples - len(chunk)
                 chunk = np.pad(chunk, (0, pad_width), mode="reflect")
 
-            # Extract embedding
+            # Extract embedding (WavLMForXVector returns XVectorOutput with .embeddings)
            chunk_tensor = torch.from_numpy(chunk).float().unsqueeze(0).to(device)
-
+            output = speaker_model(chunk_tensor)
+            embedding = output.embeddings.squeeze(0).cpu().numpy()
 
             # Validate and normalize
             if not np.isfinite(embedding).all():
@@ -715,34 +713,14 @@ class LocalSpeakerDiarizer:
 
 
 class SpeakerDiarizer:
-    """
-
-    Backends:
-    - 'pyannote': Uses pyannote-audio pipeline (requires HF token)
-    - 'local': Uses TEN-VAD + ERes2NetV2 + spectral clustering
+    """Speaker diarization using TEN-VAD + WavLM + spectral clustering.
 
     Example:
-        >>> segments = SpeakerDiarizer.diarize(audio_array
+        >>> segments = SpeakerDiarizer.diarize(audio_array)
         >>> for seg in segments:
         ...     print(f"{seg['speaker']}: {seg['start']:.2f} - {seg['end']:.2f}")
     """
 
-    _pyannote_pipeline = None
-
-    @classmethod
-    def _get_pyannote_pipeline(cls, hf_token: str | None = None):
-        """Get or create the pyannote diarization pipeline."""
-        if cls._pyannote_pipeline is None:
-            from pyannote.audio import Pipeline
-
-            cls._pyannote_pipeline = Pipeline.from_pretrained(
-                "pyannote/speaker-diarization-3.1",
-                token=hf_token,
-            )
-            cls._pyannote_pipeline.to(torch.device(_get_device()))
-
-        return cls._pyannote_pipeline
-
     @classmethod
     def diarize(
         cls,
@@ -751,8 +729,7 @@ class SpeakerDiarizer:
         num_speakers: int | None = None,
         min_speakers: int | None = None,
         max_speakers: int | None = None,
-
-        backend: str = "pyannote",
+        **_kwargs,
     ) -> list[dict]:
         """Run speaker diarization on audio.
 
@@ -762,87 +739,18 @@ class SpeakerDiarizer:
             num_speakers: Exact number of speakers (if known)
             min_speakers: Minimum number of speakers
            max_speakers: Maximum number of speakers
-            hf_token: HuggingFace token for pyannote models
-            backend: Diarization backend ("pyannote" or "local")
 
         Returns:
             List of dicts with 'speaker', 'start', 'end' keys
         """
-
-            return LocalSpeakerDiarizer.diarize(
-                audio,
-                sample_rate=sample_rate,
-                num_speakers=num_speakers,
-                min_speakers=min_speakers or 2,
-                max_speakers=max_speakers or 10,
-            )
-
-        # Default to pyannote
-        return cls._diarize_pyannote(
+        return LocalSpeakerDiarizer.diarize(
             audio,
             sample_rate=sample_rate,
            num_speakers=num_speakers,
-            min_speakers=min_speakers,
-            max_speakers=max_speakers,
-            hf_token=hf_token,
+            min_speakers=min_speakers or 2,
+            max_speakers=max_speakers or 10,
         )
 
-    @classmethod
-    def _diarize_pyannote(
-        cls,
-        audio: np.ndarray | str,
-        sample_rate: int = 16000,
-        num_speakers: int | None = None,
-        min_speakers: int | None = None,
-        max_speakers: int | None = None,
-        hf_token: str | None = None,
-    ) -> list[dict]:
-        """Run pyannote diarization."""
-        pipeline = cls._get_pyannote_pipeline(hf_token)
-
-        # Prepare audio input
-        if isinstance(audio, np.ndarray):
-            waveform = torch.from_numpy(audio.copy()).unsqueeze(0)
-            if waveform.dim() == 1:
-                waveform = waveform.unsqueeze(0)
-            audio_input = {"waveform": waveform, "sample_rate": sample_rate}
-        else:
-            audio_input = audio
-
-        # Run diarization
-        diarization_args = {}
-        if num_speakers is not None:
-            diarization_args["num_speakers"] = num_speakers
-        if min_speakers is not None:
-            diarization_args["min_speakers"] = min_speakers
-        if max_speakers is not None:
-            diarization_args["max_speakers"] = max_speakers
-
-        diarization = pipeline(audio_input, **diarization_args)
-
-        # Handle different pyannote return types
-        if hasattr(diarization, "itertracks"):
-            annotation = diarization
-        elif hasattr(diarization, "speaker_diarization"):
-            annotation = diarization.speaker_diarization
-        elif isinstance(diarization, tuple):
-            annotation = diarization[0]
-        else:
-            raise TypeError(f"Unexpected diarization output type: {type(diarization)}")
-
-        # Convert to simple format
-        segments = []
-        for turn, _, speaker in annotation.itertracks(yield_label=True):
-            segments.append(
-                {
-                    "speaker": speaker,
-                    "start": turn.start,
-                    "end": turn.end,
-                }
-            )
-
-        return segments
-
     @classmethod
     def assign_speakers_to_words(
         cls,