mazesmazes committed
Commit 32d7b9c · verified · 1 Parent(s): 6f0e8d4

Training in progress - step 19000

Files changed (1): asr_pipeline.py +0 -398
asr_pipeline.py CHANGED
@@ -1,6 +1,5 @@
 from typing import Any
 
-import numpy as np
 import torch
 import transformers
 
@@ -10,278 +9,6 @@ except ImportError:
     from asr_modeling import ASRModel  # type: ignore[no-redef]
 
 
-class ForcedAligner:
-    """Lazy-loaded forced aligner for word-level timestamps using torchaudio wav2vec2."""
-
-    _bundle = None
-    _model = None
-    _labels = None
-    _dictionary = None
-
-    @classmethod
-    def get_instance(cls, device: str = "cuda"):
-        if cls._model is None:
-            import torchaudio
-
-            cls._bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
-            cls._model = cls._bundle.get_model().to(device)
-            cls._model.eval()
-            cls._labels = cls._bundle.get_labels()
-            cls._dictionary = {c: i for i, c in enumerate(cls._labels)}
-        return cls._model, cls._labels, cls._dictionary
-
-    @classmethod
-    def align(
-        cls,
-        audio: np.ndarray,
-        text: str,
-        sample_rate: int = 16000,
-        language: str = "eng",
-        batch_size: int = 16,
-    ) -> list[dict]:
-        """Align transcript to audio and return word-level timestamps.
-
-        Args:
-            audio: Audio waveform as numpy array
-            text: Transcript text to align
-            sample_rate: Audio sample rate (default 16000)
-            language: ISO-639-3 language code (default "eng" for English, unused)
-            batch_size: Batch size for alignment model (unused)
-
-        Returns:
-            List of dicts with 'word', 'start', 'end' keys
-        """
-        import torchaudio
-        from torchaudio.functional import forced_align, merge_tokens
-
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        model, labels, dictionary = cls.get_instance(device)
-
-        # Convert audio to tensor (copy to ensure array is writable)
-        if isinstance(audio, np.ndarray):
-            waveform = torch.from_numpy(audio.copy()).float()
-        else:
-            waveform = audio.clone().float()
-
-        # Ensure 2D (channels, time)
-        if waveform.dim() == 1:
-            waveform = waveform.unsqueeze(0)
-
-        # Resample if needed (wav2vec2 expects 16kHz)
-        if sample_rate != cls._bundle.sample_rate:
-            waveform = torchaudio.functional.resample(
-                waveform, sample_rate, cls._bundle.sample_rate
-            )
-
-        waveform = waveform.to(device)
-
-        # Get emissions from model
-        with torch.inference_mode():
-            emissions, _ = model(waveform)
-            emissions = torch.log_softmax(emissions, dim=-1)
-
-        emission = emissions[0].cpu()
-
-        # Normalize text: uppercase, keep only valid characters
-        transcript = text.upper()
-        # Build tokens from transcript
-        tokens = []
-        for char in transcript:
-            if char in dictionary:
-                tokens.append(dictionary[char])
-            elif char == " ":
-                tokens.append(dictionary.get("|", dictionary.get(" ", 0)))
-
-        if not tokens:
-            return []
-
-        targets = torch.tensor([tokens], dtype=torch.int32)
-
-        # Run forced alignment
-        # Note: forced_align is deprecated in torchaudio 2.6+ and will be removed in 2.9 (late 2025)
-        # No official replacement announced yet. See https://github.com/pytorch/audio/issues/3902
-        aligned_tokens, scores = forced_align(emission.unsqueeze(0), targets, blank=0)
-
-        # Use torchaudio's merge_tokens to get token spans (removes blanks and merges repeats)
-        token_spans = merge_tokens(aligned_tokens[0], scores[0])
-
-        # Convert frame indices to time (model stride is 320 samples at 16kHz = 20ms)
-        frame_duration = 320 / cls._bundle.sample_rate
-
-        # Group token spans into words based on pipe separator
-        words = text.split()
-        word_timestamps = []
-        current_word_start = None
-        current_word_end = None
-        word_idx = 0
-
-        for span in token_spans:
-            token_char = labels[span.token]
-            if token_char == "|":  # Word separator
-                if current_word_start is not None and word_idx < len(words):
-                    word_timestamps.append({
-                        "word": words[word_idx],
-                        "start": current_word_start * frame_duration,
-                        "end": current_word_end * frame_duration,
-                    })
-                    word_idx += 1
-                current_word_start = None
-                current_word_end = None
-            else:
-                if current_word_start is None:
-                    current_word_start = span.start
-                current_word_end = span.end
-
-        # Don't forget the last word
-        if current_word_start is not None and word_idx < len(words):
-            word_timestamps.append({
-                "word": words[word_idx],
-                "start": current_word_start * frame_duration,
-                "end": current_word_end * frame_duration,
-            })
-
-        return word_timestamps
-
-
-class SpeakerDiarizer:
-    """Lazy-loaded speaker diarization using pyannote-audio."""
-
-    _pipeline = None
-
-    @classmethod
-    def get_instance(cls, hf_token: str | None = None):
-        """Get or create the diarization pipeline.
-
-        Args:
-            hf_token: HuggingFace token with access to pyannote models.
-                Can also be set via HF_TOKEN environment variable.
-        """
-        if cls._pipeline is None:
-            from pyannote.audio import Pipeline
-
-            cls._pipeline = Pipeline.from_pretrained(
-                "pyannote/speaker-diarization-3.1",
-            )
-
-            # Move to GPU if available
-            if torch.cuda.is_available():
-                cls._pipeline.to(torch.device("cuda"))
-            elif torch.backends.mps.is_available():
-                cls._pipeline.to(torch.device("mps"))
-
-        return cls._pipeline
-
-    @classmethod
-    def diarize(
-        cls,
-        audio: np.ndarray | str,
-        sample_rate: int = 16000,
-        num_speakers: int | None = None,
-        min_speakers: int | None = None,
-        max_speakers: int | None = None,
-        hf_token: str | None = None,
-    ) -> list[dict]:
-        """Run speaker diarization on audio.
-
-        Args:
-            audio: Audio waveform as numpy array or path to audio file
-            sample_rate: Audio sample rate (default 16000)
-            num_speakers: Exact number of speakers (if known)
-            min_speakers: Minimum number of speakers
-            max_speakers: Maximum number of speakers
-            hf_token: HuggingFace token for pyannote models
-
-        Returns:
-            List of dicts with 'speaker', 'start', 'end' keys
-        """
-        pipeline = cls.get_instance(hf_token)
-
-        # Prepare audio input
-        if isinstance(audio, np.ndarray):
-            # pyannote expects {"waveform": tensor, "sample_rate": int}
-            waveform = torch.from_numpy(audio).unsqueeze(0)  # Add channel dim
-            if waveform.dim() == 1:
-                waveform = waveform.unsqueeze(0)
-            audio_input = {"waveform": waveform, "sample_rate": sample_rate}
-        else:
-            # File path
-            audio_input = audio
-
-        # Run diarization
-        diarization_args = {}
-        if num_speakers is not None:
-            diarization_args["num_speakers"] = num_speakers
-        if min_speakers is not None:
-            diarization_args["min_speakers"] = min_speakers
-        if max_speakers is not None:
-            diarization_args["max_speakers"] = max_speakers
-
-        diarization = pipeline(audio_input, **diarization_args)
-
-        # Handle different pyannote return types
-        # pyannote 3.x returns DiarizeOutput dataclass, older versions return Annotation
-        if hasattr(diarization, "itertracks"):
-            annotation = diarization
-        elif hasattr(diarization, "speaker_diarization"):
-            # pyannote 3.x DiarizeOutput dataclass
-            annotation = diarization.speaker_diarization
-        elif isinstance(diarization, tuple):
-            # Some versions return (annotation, embeddings) tuple
-            annotation = diarization[0]
-        else:
-            raise TypeError(f"Unexpected diarization output type: {type(diarization)}")
-
-        # Convert to simple format
-        segments = []
-        for turn, _, speaker in annotation.itertracks(yield_label=True):
-            segments.append({
-                "speaker": speaker,
-                "start": turn.start,
-                "end": turn.end,
-            })
-
-        return segments
-
-    @classmethod
-    def assign_speakers_to_words(
-        cls,
-        words: list[dict],
-        speaker_segments: list[dict],
-    ) -> list[dict]:
-        """Assign speaker labels to words based on timestamp overlap.
-
-        Args:
-            words: List of word dicts with 'word', 'start', 'end' keys
-            speaker_segments: List of speaker dicts with 'speaker', 'start', 'end' keys
-
-        Returns:
-            Words list with 'speaker' key added to each word
-        """
-        for word in words:
-            word_mid = (word["start"] + word["end"]) / 2
-
-            # Find the speaker segment that contains this word's midpoint
-            best_speaker = None
-            for seg in speaker_segments:
-                if seg["start"] <= word_mid <= seg["end"]:
-                    best_speaker = seg["speaker"]
-                    break
-
-            # If no exact match, find closest segment
-            if best_speaker is None and speaker_segments:
-                min_dist = float("inf")
-                for seg in speaker_segments:
-                    seg_mid = (seg["start"] + seg["end"]) / 2
-                    dist = abs(word_mid - seg_mid)
-                    if dist < min_dist:
-                        min_dist = dist
-                        best_speaker = seg["speaker"]
-
-            word["speaker"] = best_speaker
-
-        return words
-
-
 class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
     """ASR Pipeline for audio-to-text transcription."""
 
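For readers without the pre-commit file, the removed ForcedAligner boils down to torchaudio's CTC forced-alignment API. Below is a minimal standalone sketch, not the author's code: the function name align_words is mine, and it assumes torchaudio >= 2.1 and < 2.9, where forced_align and merge_tokens still exist.

import torch
import torchaudio
from torchaudio.functional import forced_align, merge_tokens

bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
model = bundle.get_model().eval()
labels = bundle.get_labels()  # ('-', '|', 'E', 'T', ...); '-' is the CTC blank
dictionary = {c: i for i, c in enumerate(labels)}


def align_words(waveform: torch.Tensor, text: str) -> list[dict]:
    """waveform: (1, time) at bundle.sample_rate (16 kHz)."""
    with torch.inference_mode():
        emissions, _ = model(waveform)  # (1, frames, vocab)
    log_probs = torch.log_softmax(emissions, dim=-1).cpu()

    # Spell the transcript in the model's character vocabulary; '|' marks word ends.
    tokens = [dictionary[c] for c in text.upper().replace(" ", "|") if c in dictionary]
    targets = torch.tensor([tokens], dtype=torch.int32)

    aligned, scores = forced_align(log_probs, targets, blank=0)
    spans = merge_tokens(aligned[0], scores[0])  # drop blanks, merge repeats

    frame = 320 / bundle.sample_rate  # wav2vec2 frame stride: 320 samples = 20 ms
    words, out, start, end = text.split(), [], None, None
    for span in spans:
        if labels[span.token] == "|":  # word boundary: flush the current word
            if start is not None and len(out) < len(words):
                out.append({"word": words[len(out)],
                            "start": start * frame, "end": end * frame})
            start = None
        else:
            start = span.start if start is None else start
            end = span.end
    if start is not None and len(out) < len(words):  # last word has no trailing '|'
        out.append({"word": words[len(out)],
                    "start": start * frame, "end": end * frame})
    return out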
@@ -297,131 +24,6 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
         super().__init__(
             model=model, feature_extractor=feature_extractor, tokenizer=tokenizer, **kwargs
         )
-        self._current_audio = None
-
-    def _sanitize_parameters(self, **kwargs):
-        """Intercept our custom parameters before parent class validates them."""
-        # Remove our custom parameters so parent doesn't see them
-        kwargs.pop("return_timestamps", None)
-        kwargs.pop("return_speakers", None)
-        kwargs.pop("num_speakers", None)
-        kwargs.pop("min_speakers", None)
-        kwargs.pop("max_speakers", None)
-        kwargs.pop("hf_token", None)
-
-        return super()._sanitize_parameters(**kwargs)
-
-    def __call__(
-        self,
-        inputs,
-        **kwargs,
-    ):
-        """Transcribe audio with optional word-level timestamps and speaker diarization.
-
-        Args:
-            inputs: Audio input (file path, dict with array/sampling_rate, etc.)
-            return_timestamps: If True, return word-level timestamps using forced alignment
-            return_speakers: If True, return speaker labels for each word
-            num_speakers: Exact number of speakers (if known, for diarization)
-            min_speakers: Minimum number of speakers (for diarization)
-            max_speakers: Maximum number of speakers (for diarization)
-            hf_token: HuggingFace token for pyannote models (or set HF_TOKEN env var)
-            **kwargs: Additional arguments passed to the pipeline
-
-        Returns:
-            Dict with 'text' key, 'words' key if return_timestamps=True,
-            and speaker labels on words if return_speakers=True
-        """
-        # Extract our params before super().__call__ (which will also call _sanitize_parameters)
-        return_timestamps = kwargs.pop("return_timestamps", False)
-        return_speakers = kwargs.pop("return_speakers", False)
-        diarization_params = {
-            "num_speakers": kwargs.pop("num_speakers", None),
-            "min_speakers": kwargs.pop("min_speakers", None),
-            "max_speakers": kwargs.pop("max_speakers", None),
-            "hf_token": kwargs.pop("hf_token", None),
-        }
-
-        if return_speakers:
-            return_timestamps = True
-
-        # Store audio for timestamp alignment and diarization
-        if return_timestamps or return_speakers:
-            self._current_audio = self._extract_audio(inputs)
-
-        # Run standard transcription
-        result = super().__call__(inputs, **kwargs)
-
-        # Add timestamps if requested
-        if return_timestamps and self._current_audio is not None:
-            text = result.get("text", "")
-            if text:
-                try:
-                    words = ForcedAligner.align(
-                        self._current_audio["array"],
-                        text,
-                        sample_rate=self._current_audio.get("sampling_rate", 16000),
-                    )
-                    result["words"] = words
-                except Exception as e:
-                    result["words"] = []
-                    result["timestamp_error"] = str(e)
-            else:
-                result["words"] = []
-
-        # Add speaker diarization if requested
-        if return_speakers and self._current_audio is not None:
-            try:
-                # Run diarization
-                speaker_segments = SpeakerDiarizer.diarize(
-                    self._current_audio["array"],
-                    sample_rate=self._current_audio.get("sampling_rate", 16000),
-                    **{k: v for k, v in diarization_params.items() if v is not None},
-                )
-                result["speaker_segments"] = speaker_segments
-
-                # Assign speakers to words
-                if result.get("words"):
-                    result["words"] = SpeakerDiarizer.assign_speakers_to_words(
-                        result["words"],
-                        speaker_segments,
-                    )
-            except Exception as e:
-                result["speaker_segments"] = []
-                result["diarization_error"] = str(e)
-
-        # Clean up
-        self._current_audio = None
-
-        return result
-
-    def _extract_audio(self, inputs) -> dict | None:
-        """Extract audio array from various input formats using HF utilities."""
-        from transformers.pipelines.audio_utils import ffmpeg_read
-
-        if isinstance(inputs, dict):
-            if "array" in inputs:
-                return {
-                    "array": inputs["array"],
-                    "sampling_rate": inputs.get("sampling_rate", 16000),
-                }
-            if "raw" in inputs:
-                return {
-                    "array": inputs["raw"],
-                    "sampling_rate": inputs.get("sampling_rate", 16000),
-                }
-        elif isinstance(inputs, str):
-            # File path - load audio using ffmpeg (same as HF pipeline)
-            with open(inputs, "rb") as f:
-                audio = ffmpeg_read(f.read(), sampling_rate=16000)
-            return {"array": audio, "sampling_rate": 16000}
-        elif isinstance(inputs, bytes):
-            audio = ffmpeg_read(inputs, sampling_rate=16000)
-            return {"array": audio, "sampling_rate": 16000}
-        elif isinstance(inputs, np.ndarray):
-            return {"array": inputs, "sampling_rate": 16000}
-
-        return None
 
     def preprocess(self, inputs, **preprocess_params):
         # Handle dict with "array" key (from datasets)
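For context, before this commit the custom __call__ accepted the extra keyword arguments documented in the removed docstring above. A hypothetical usage sketch of the pre-commit pipeline follows; the checkpoint id and audio file name are placeholders, not values from this repo.

from transformers import pipeline

# "your-org/your-asr-checkpoint" and "meeting.wav" are placeholders.
asr = pipeline(
    "automatic-speech-recognition",
    model="your-org/your-asr-checkpoint",
    trust_remote_code=True,  # the repo ships this custom ASRPipeline
)
result = asr(
    "meeting.wav",
    return_timestamps=True,  # word timestamps via the (now removed) ForcedAligner
    return_speakers=True,    # speaker labels via pyannote; implies timestamps
    num_speakers=2,          # optional hint forwarded to diarization
)
print(result["text"])
for w in result.get("words", []):
    print(f'{w.get("speaker")}: {w["word"]} [{w["start"]:.2f}-{w["end"]:.2f}s]')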
 