rikhoffbauer2
/

lyric-sync

ml-intern

Model card Files Files and versions

xet

Community

rikhoffbauer2 committed 18 days ago

Commit

8b482f1

verified ·

1 Parent(s): 4b10521

Upload lyric_sync/transcribe.py

Browse files

Files changed (1) hide show

lyric_sync/transcribe.py +396 -0

lyric_sync/transcribe.py ADDED Viewed

	@@ -0,0 +1,396 @@

+"""
+Word-level transcription of vocal audio.
+Supports multiple backends:
+- WhisperX (recommended): Whisper transcription + wav2vec2 phoneme alignment
+- Whisper (transformers pipeline): Simpler, less precise alignment
+- Granite Speech: IBM's timestamp-capable model (experimental for singing)
+WhisperX is recommended because its two-stage approach (transcription + forced
+phoneme alignment) is more robust for singing than Whisper's attention-based
+word timestamps.
+"""
+import logging
+import re
+from dataclasses import dataclass, field
+from typing import Optional
+import numpy as np
+logger = logging.getLogger(__name__)
@dataclass
class TimedWord:
    """A single word with timing information.

    Attributes:
        word: The word text.
        start: Start time in seconds.
        end: End time in seconds.
        confidence: Score attached to the word; 1.0 when none is supplied.
    """

    word: str
    start: float  # seconds
    end: float    # seconds
    confidence: float = 1.0

    @property
    def duration(self) -> float:
        """Length of the word in seconds (``end - start``)."""
        return self.end - self.start

    def __repr__(self) -> str:
        # ``!r`` lets Python pick the quoting, so words containing an
        # apostrophe ("don't") or a backslash are rendered unambiguously
        # instead of being wrapped in hard-coded single quotes.
        return f"TimedWord({self.word!r}, {self.start:.3f}-{self.end:.3f})"
@dataclass
class TranscriptionResult:
    """Transcript text together with its word-level timings.

    Attributes:
        text: The full transcript as a single string.
        words: Timed words, in the order they were produced.
        language: Language code of the transcript; defaults to "en".
    """

    text: str
    words: list[TimedWord] = field(default_factory=list)
    language: str = "en"

    @property
    def duration(self) -> float:
        """Seconds from the first word's start to the last word's end.

        Returns 0.0 when there are no words.
        """
        if self.words:
            first_word, last_word = self.words[0], self.words[-1]
            return last_word.end - first_word.start
        return 0.0
class WhisperXTranscriber:
    """
    Word-level transcription using WhisperX.

    Two-stage approach:
    1. Whisper large-v2/v3 for text transcription (batched)
    2. wav2vec2 phoneme model for forced word-level alignment

    This decoupled approach is robust to the timing drift that Whisper's
    native word_timestamps exhibit on singing (stretched syllables).

    Reference: arxiv:2303.00747 (WhisperX paper)
    """

    def __init__(
        self,
        model_size: str = "large-v2",
        device: str = "cuda",
        compute_type: str = "float16",
        language: str = "en",
        batch_size: int = 16,
    ):
        """
        Args:
            model_size: Whisper model size. "large-v2" recommended for lyrics (per arxiv:2506.15514).
            device: "cuda" or "cpu"
            compute_type: "float16" (GPU) or "int8" (CPU) or "float32"
            language: Language code for transcription
            batch_size: Batch size for transcription (reduce if OOM)
        """
        self.model_size = model_size
        self.device = device
        self.compute_type = compute_type
        self.language = language
        self.batch_size = batch_size
        self._model = None
        # Prompt the currently loaded ASR model was built with (see _load_models).
        self._model_prompt: Optional[str] = None
        self._align_model = None
        self._align_metadata = None

    def _load_models(self, initial_prompt: Optional[str] = None):
        """Lazy-load the WhisperX ASR and alignment models.

        Args:
            initial_prompt: Decoding prompt for the ASR model. WhisperX takes
                the prompt as a load-time ASR option, so when a prompt is
                given that differs from the one the loaded model was built
                with, the ASR model is loaded again. ``None`` keeps whatever
                is already loaded.
        """
        import whisperx

        loaded_something = False
        prompt_changed = (
            initial_prompt is not None and initial_prompt != self._model_prompt
        )
        if self._model is None or prompt_changed:
            self._model = whisperx.load_model(
                self.model_size,
                self.device,
                compute_type=self.compute_type,
                language=self.language,
                asr_options={"initial_prompt": initial_prompt},
            )
            self._model_prompt = initial_prompt
            loaded_something = True

        # The alignment model does not depend on the prompt; load it once.
        if self._align_model is None:
            self._align_model, self._align_metadata = whisperx.load_align_model(
                language_code=self.language,
                device=self.device,
            )
            loaded_something = True

        if loaded_something:
            logger.info(f"Loaded WhisperX: {self.model_size} + alignment model on {self.device}")

    def transcribe(
        self,
        audio: np.ndarray,
        sr: int = 16000,
        initial_prompt: str = "Song lyrics: ",
    ) -> TranscriptionResult:
        """
        Transcribe audio with word-level timestamps.

        Args:
            audio: Mono float32 numpy array
            sr: Sample rate (16000 for Whisper); other rates are resampled
            initial_prompt: Prompt to bias Whisper toward lyrics domain.
                Changing it between calls reloads the ASR model.

        Returns:
            TranscriptionResult with word-level timings. Only words that the
            aligner gave both a start and an end time are included, and
            ``text`` is rebuilt from those words.
        """
        import whisperx

        self._load_models(initial_prompt)

        # The docstring contract is float32; coerce so a float64 array from
        # an upstream loader does not reach the models.
        audio = np.asarray(audio, dtype=np.float32)

        # WhisperX works on 16 kHz mono audio; resample anything else.
        if sr != 16000:
            import torch
            import torchaudio

            audio_t = torch.from_numpy(audio).unsqueeze(0)
            audio_t = torchaudio.functional.resample(audio_t, sr, 16000)
            audio = audio_t.squeeze(0).numpy()

        # Step 1: Transcribe.
        # The pipeline's window argument is ``chunk_size`` and it has no
        # ``initial_prompt`` parameter (the prompt is applied at load time),
        # so neither ``chunk_length`` nor ``initial_prompt`` may be passed here.
        result = self._model.transcribe(
            audio,
            batch_size=self.batch_size,
            language=self.language,
            chunk_size=30,  # 30s context — best for singing (arxiv:2506.15514)
        )

        # Step 2: Forced word-level alignment via wav2vec2
        result = whisperx.align(
            result["segments"],
            self._align_model,
            self._align_metadata,
            audio,
            self.device,
            return_char_alignments=False,
        )

        # Convert to our format, skipping entries without both timestamps.
        words = [
            TimedWord(
                word=ws["word"].strip(),
                start=ws["start"],
                end=ws["end"],
                confidence=ws.get("score", 1.0),
            )
            for ws in result.get("word_segments", [])
            if "start" in ws and "end" in ws
        ]
        full_text = " ".join(w.word for w in words)
        return TranscriptionResult(text=full_text, words=words, language=self.language)
class WhisperTranscriber:
    """
    Simpler fallback: Whisper via transformers pipeline with word timestamps.

    Uses Whisper's built-in cross-attention DTW for word-level timestamps.
    Less precise than WhisperX on singing but has fewer dependencies.
    """

    # Whisper language names -> two-letter codes, used to label the result.
    # Anything not listed (including values that are already codes) is
    # passed through unchanged.
    _LANGUAGE_CODES = {
        "english": "en",
        "chinese": "zh",
        "german": "de",
        "spanish": "es",
        "russian": "ru",
        "korean": "ko",
        "french": "fr",
        "japanese": "ja",
        "portuguese": "pt",
        "turkish": "tr",
        "polish": "pl",
        "catalan": "ca",
        "dutch": "nl",
        "arabic": "ar",
        "swedish": "sv",
        "italian": "it",
        "indonesian": "id",
        "hindi": "hi",
        "finnish": "fi",
        "vietnamese": "vi",
        "hebrew": "he",
        "ukrainian": "uk",
        "greek": "el",
        "czech": "cs",
        "romanian": "ro",
        "danish": "da",
        "hungarian": "hu",
        "norwegian": "no",
        "thai": "th",
    }

    def __init__(
        self,
        model_id: str = "openai/whisper-large-v3",
        device: str = "cuda",
        torch_dtype: str = "float16",
    ):
        """
        Args:
            model_id: Hugging Face model id of the Whisper checkpoint
            device: "cuda" (or another torch device string) or "cpu"
            torch_dtype: "float16", "float32" or "bfloat16"; any other value
                falls back to float16
        """
        self.model_id = model_id
        self.device = device
        self.torch_dtype = torch_dtype
        self._pipe = None

    @classmethod
    def _language_code(cls, language: str) -> str:
        """Return the language code for a Whisper language name.

        Slicing the name to two characters is only right by accident
        ("english" -> "en") and wrong for e.g. "german" or "spanish", so a
        lookup table is used; unknown values are returned lower-cased as is.
        """
        key = language.strip().lower()
        return cls._LANGUAGE_CODES.get(key, key)

    def _load_model(self):
        """Lazy-load the transformers ASR pipeline (no-op once loaded)."""
        if self._pipe is not None:
            return
        import torch
        from transformers import pipeline

        dtype_map = {"float16": torch.float16, "float32": torch.float32, "bfloat16": torch.bfloat16}
        self._pipe = pipeline(
            task="automatic-speech-recognition",
            model=self.model_id,
            torch_dtype=dtype_map.get(self.torch_dtype, torch.float16),
            # The pipeline uses -1 to mean CPU.
            device=self.device if self.device != "cpu" else -1,
            model_kwargs={"attn_implementation": "sdpa"},
        )
        logger.info(f"Loaded Whisper pipeline: {self.model_id} on {self.device}")

    def transcribe(
        self,
        audio: np.ndarray,
        sr: int = 16000,
        language: str = "english",
    ) -> TranscriptionResult:
        """
        Transcribe with word-level timestamps via transformers pipeline.

        Args:
            audio: Mono float32 numpy array at sr Hz
            sr: Sample rate
            language: Language for transcription (name such as "english",
                or a language code)

        Returns:
            TranscriptionResult containing every chunk that has non-empty
            text and both timestamps; ``text`` is rebuilt from those words.
        """
        self._load_model()
        result = self._pipe(
            {"array": audio, "sampling_rate": sr},
            return_timestamps="word",
            # ``condition_on_previous_text`` is an openai-whisper option, not
            # a transformers generate argument, so it must not be passed here.
            # With chunk_length_s set, each window is decoded on its own, so
            # there is no previous-text conditioning to switch off.
            generate_kwargs={
                "language": language,
                "task": "transcribe",
            },
            chunk_length_s=30,
            stride_length_s=5,
        )

        words = []
        for chunk in result.get("chunks", []):
            text = chunk["text"].strip()
            ts = chunk.get("timestamp", (None, None))
            # Skip empty tokens and chunks with a missing start or end time.
            if text and ts[0] is not None and ts[1] is not None:
                words.append(TimedWord(
                    word=text,
                    start=ts[0],
                    end=ts[1],
                ))
        full_text = " ".join(w.word for w in words)
        return TranscriptionResult(
            text=full_text,
            words=words,
            language=self._language_code(language),
        )
class GraniteSpeechTranscriber:
    """
    Experimental: IBM Granite Speech 4.1 2B Plus with word timestamps.

    Uses in-model [T:NNN] timestamp tokens. Promising but:
    - Only works up to ~5 minutes in timestamp mode
    - Trained on speech only (not singing)
    - Only outputs word-end times (not start)

    Reference: arxiv:2604.22817 (In-Sync paper)
    """

    # "word [T:NNN]" pairs; NNN is a three-digit centisecond counter.
    _TIMESTAMP_RE = re.compile(r"(\S+)\s*\[T:(\d{3})\]")

    def __init__(self, device: str = "cuda"):
        """
        Args:
            device: Target device. The default "cuda" keeps automatic
                placement; any other value (e.g. "cpu", "cuda:1") is used
                as the explicit device for the model.
        """
        self.device = device
        self.model_id = "ibm-granite/granite-speech-4.1-2b-plus"
        self._model = None
        self._processor = None

    def _load_model(self):
        """Lazy-load the processor and model (no-op once loaded)."""
        if self._model is not None:
            return
        import torch
        from transformers import AutoModelForCausalLM, AutoProcessor

        # Honour an explicitly requested device instead of always using
        # "auto"; the default "cuda" keeps the previous automatic placement.
        device_map = "auto" if self.device == "cuda" else self.device
        self._processor = AutoProcessor.from_pretrained(self.model_id)
        self._model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            torch_dtype=torch.bfloat16,
            device_map=device_map,
        )
        logger.info(f"Loaded Granite Speech: {self.model_id}")

    def transcribe(
        self,
        audio: np.ndarray,
        sr: int = 16000,
    ) -> TranscriptionResult:
        """
        Transcribe with word-level end-timestamps via Granite's [T:NNN] tokens.

        Args:
            audio: Audio samples as a numpy array
            sr: Sample rate of ``audio``

        Returns:
            TranscriptionResult whose words come from the parsed timestamp
            tokens; start times are estimates (see _parse_granite_timestamps).
        """
        import torch
        self._load_model()
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "audio", "audio": audio, "sampling_rate": sr},
                    {"type": "text", "text": "Please transcribe the speech into written format and add word-level timestamps."},
                ],
            }
        ]
        inputs = self._processor.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(self._model.device, dtype=torch.bfloat16)
        output_ids = self._model.generate(**inputs, max_new_tokens=2048)
        # Decode only the newly generated tokens, not the echoed prompt.
        output_text = self._processor.decode(
            output_ids[0][inputs["input_ids"].shape[1]:],
            skip_special_tokens=True,
        )
        words = self._parse_granite_timestamps(output_text)
        full_text = " ".join(w.word for w in words)
        return TranscriptionResult(text=full_text, words=words)

    @staticmethod
    def _parse_granite_timestamps(text: str) -> list[TimedWord]:
        """
        Parse Granite [T:NNN] format where NNN is centiseconds.
        Handles 10-second rollover.

        Format: "word1 [T:012] word2 [T:045] ..."

        Args:
            text: Decoded model output.

        Returns:
            Words in order of appearance. Each word's end comes from its
            timestamp token; its start is the previous word's end (or 0.3 s
            before its own end for the first word, floored at 0). "_" tokens
            are treated as boundary markers and dropped. Ends never precede
            starts.
        """
        words = []
        rollover = 0
        prev_cs = 0
        for word_text, cs_str in GraniteSpeechTranscriber._TIMESTAMP_RE.findall(text):
            cs = int(cs_str)
            # A drop of more than 50 cs means the 3-digit counter wrapped;
            # smaller drops are treated as jitter, not a wrap.
            if cs < prev_cs - 50:
                rollover += 1
            prev_cs = cs
            end_time = (cs + rollover * 1000) / 100.0
            # Granite only gives end times; estimate start from previous word's end
            start_time = words[-1].end if words else max(0.0, end_time - 0.3)
            # Jitter below the wrap threshold would otherwise put the end
            # before the start and yield a negative duration.
            end_time = max(end_time, start_time)
            if word_text != "_":  # underscore = sentence boundary marker
                words.append(TimedWord(
                    word=word_text,
                    start=start_time,
                    end=end_time,
                ))
        return words
def transcribe_vocals(
    audio: np.ndarray,
    sr: int = 16000,
    backend: str = "whisperx",
    device: str = "cuda",
    language: str = "en",
    **kwargs,
) -> TranscriptionResult:
    """
    Transcribe vocals with word-level timestamps.

    Args:
        audio: Mono float32 numpy array
        sr: Sample rate
        backend: "whisperx" (recommended), "whisper", or "granite"
        device: "cuda" or "cpu"
        language: Language code. Used by the "whisperx" and "whisper"
            backends; the "granite" backend takes no language argument.
        **kwargs: Additional args passed to the backend's constructor
            ("whisperx" and "whisper" only; ignored with a warning for
            "granite", whose constructor accepts only ``device``).

    Returns:
        TranscriptionResult with word-level timings

    Raises:
        ValueError: If ``backend`` is not one of the supported names.
    """
    if backend == "whisperx":
        transcriber = WhisperXTranscriber(device=device, language=language, **kwargs)
        return transcriber.transcribe(audio, sr=sr)

    if backend == "whisper":
        transcriber = WhisperTranscriber(device=device, **kwargs)
        # This backend takes the language per call; forward it so the
        # caller's choice is not silently replaced by the English default.
        return transcriber.transcribe(audio, sr=sr, language=language)

    if backend == "granite":
        if kwargs:
            logger.warning(
                "Ignoring unsupported options for the granite backend: %s",
                ", ".join(sorted(kwargs)),
            )
        transcriber = GraniteSpeechTranscriber(device=device)
        return transcriber.transcribe(audio, sr=sr)

    raise ValueError(f"Unknown backend: {backend}. Use 'whisperx', 'whisper', or 'granite'.")