mazesmazes
/

tiny-audio

@@ -24,7 +24,10 @@ def _get_device() -> str:
 class ForcedAligner:
-    """Lazy-loaded forced aligner for word-level timestamps using torchaudio wav2vec2."""
     _bundle = None
     _model = None
@@ -51,6 +54,107 @@ class ForcedAligner:
             cls._dictionary = {c: i for i, c in enumerate(cls._labels)}
         return cls._model, cls._labels, cls._dictionary
     @classmethod
     def align(
         cls,
@@ -59,21 +163,26 @@ class ForcedAligner:
         sample_rate: int = 16000,
         _language: str = "eng",
         _batch_size: int = 16,
     ) -> list[dict]:
         """Align transcript to audio and return word-level timestamps.
         Args:
             audio: Audio waveform as numpy array
             text: Transcript text to align
             sample_rate: Audio sample rate (default 16000)
             _language: ISO-639-3 language code (default "eng" for English, unused)
             _batch_size: Batch size for alignment model (unused)
         Returns:
             List of dicts with 'word', 'start', 'end' keys
         """
         import torchaudio
-        from torchaudio.functional import forced_align, merge_tokens
         device = _get_device()
         model, labels, dictionary = cls.get_instance(device)
@@ -105,7 +214,8 @@ class ForcedAligner:
         # Normalize text: uppercase, keep only valid characters
         transcript = text.upper()
-        # Build tokens from transcript
         tokens = []
         for char in transcript:
             if char in dictionary:
@@ -116,35 +226,34 @@ class ForcedAligner:
         if not tokens:
             return []
-        targets = torch.tensor([tokens], dtype=torch.int32)
-        # Run forced alignment
-        # Note: forced_align is deprecated in torchaudio 2.6+ and will be removed in 2.9 (late 2025)
-        # No official replacement announced yet. See https://github.com/pytorch/audio/issues/3902
-        aligned_tokens, scores = forced_align(emission.unsqueeze(0), targets, blank=0)
-        # Use torchaudio's merge_tokens to get token spans (removes blanks and merges repeats)
-        token_spans = merge_tokens(aligned_tokens[0], scores[0])
         # Convert frame indices to time (model stride is 320 samples at 16kHz = 20ms)
         frame_duration = 320 / cls._bundle.sample_rate
-        # Group token spans into words based on pipe separator
         words = text.split()
         word_timestamps = []
         current_word_start = None
         current_word_end = None
         word_idx = 0
-        for span in token_spans:
-            token_char = labels[span.token]
-            if token_char == "|":  # Word separator
                 if current_word_start is not None and word_idx < len(words):
                     word_timestamps.append(
                         {
                             "word": words[word_idx],
-                            "start": current_word_start * frame_duration,
-                            "end": current_word_end * frame_duration,
                         }
                     )
                     word_idx += 1
@@ -152,16 +261,18 @@ class ForcedAligner:
                 current_word_end = None
             else:
                 if current_word_start is None:
-                    current_word_start = span.start
-                current_word_end = span.end
         # Don't forget the last word
         if current_word_start is not None and word_idx < len(words):
             word_timestamps.append(
                 {
                     "word": words[word_idx],
-                    "start": current_word_start * frame_duration,
-                    "end": current_word_end * frame_duration,
                 }
             )

 class ForcedAligner:
+    """Lazy-loaded forced aligner for word-level timestamps using torchaudio wav2vec2.
+    Uses Viterbi trellis algorithm for optimal alignment path finding.
+    """
     _bundle = None
     _model = None
             cls._dictionary = {c: i for i, c in enumerate(cls._labels)}
         return cls._model, cls._labels, cls._dictionary
+    @staticmethod
+    def _get_trellis(emission: torch.Tensor, tokens: list[int], blank_id: int = 0) -> torch.Tensor:
+        """Build Viterbi trellis for forced alignment.
+        The trellis is a 2D matrix where trellis[t, j] represents the log probability
+        of the most likely path that has emitted j tokens at time t.
+        Args:
+            emission: Log-softmax emission matrix of shape (num_frames, num_classes)
+            tokens: List of target token indices
+            blank_id: Index of the blank/CTC token (default 0)
+        Returns:
+            Trellis matrix of shape (num_frames + 1, num_tokens + 1)
+        """
+        num_frames = emission.size(0)
+        num_tokens = len(tokens)
+        # Initialize trellis with -inf (impossible paths)
+        trellis = torch.full((num_frames + 1, num_tokens + 1), -float("inf"))
+        trellis[0, 0] = 0  # Start state has probability 1
+        for t in range(num_frames):
+            for j in range(num_tokens + 1):
+                # Stay in current state (emit blank)
+                if j < num_tokens + 1:
+                    stay_prob = trellis[t, j] + emission[t, blank_id]
+                else:
+                    stay_prob = -float("inf")
+                # Move to next state (emit token)
+                if j > 0:
+                    move_prob = trellis[t, j - 1] + emission[t, tokens[j - 1]]
+                else:
+                    move_prob = -float("inf")
+                trellis[t + 1, j] = max(stay_prob, move_prob)
+        return trellis
+    @staticmethod
+    def _backtrack(
+        trellis: torch.Tensor, emission: torch.Tensor, tokens: list[int], blank_id: int = 0
+    ) -> list[tuple[int, int, int]]:
+        """Backtrack through trellis to find optimal alignment path.
+        Args:
+            trellis: Trellis matrix from _get_trellis
+            emission: Log-softmax emission matrix
+            tokens: List of target token indices
+            blank_id: Index of the blank/CTC token
+        Returns:
+            List of (token_idx, start_frame, end_frame) tuples
+        """
+        num_frames = emission.size(0)
+        num_tokens = len(tokens)
+        # Start from the end
+        t = num_frames
+        j = num_tokens
+        path = []
+        # Backtrack to find where each token was emitted
+        while j > 0:
+            # Find the frame where token j-1 was first emitted
+            token_end = t
+            # Walk back while staying in state j (emitting blanks)
+            while t > 0:
+                stay_prob = trellis[t - 1, j] + emission[t - 1, blank_id]
+                if j > 0:
+                    move_prob = trellis[t - 1, j - 1] + emission[t - 1, tokens[j - 1]]
+                else:
+                    move_prob = -float("inf")
+                # Check if we moved into this state or stayed
+                if move_prob > stay_prob:
+                    # We moved into state j at time t-1
+                    token_start = t - 1
+                    path.append((tokens[j - 1], token_start, token_end))
+                    j -= 1
+                    t -= 1
+                    break
+                else:
+                    # We stayed in state j
+                    t -= 1
+            if t == 0 and j > 0:
+                # Handle edge case: remaining tokens at the start
+                path.append((tokens[j - 1], 0, token_end))
+                j -= 1
+        # Reverse to get chronological order
+        path.reverse()
+        return path
+    # Fixed time offset (in seconds) to compensate for Wav2Vec2 convolutional look-ahead.
+    # It is subtracted from word start/end times, shifting them earlier so they feel more "natural".
+    OFFSET_COMPENSATION = 0.04  # 40ms
     @classmethod
     def align(
         cls,
         sample_rate: int = 16000,
         _language: str = "eng",
         _batch_size: int = 16,
+        offset_compensation: float | None = None,
     ) -> list[dict]:
         """Align transcript to audio and return word-level timestamps.
+        Uses Viterbi trellis algorithm for optimal forced alignment.
         Args:
             audio: Audio waveform as numpy array
             text: Transcript text to align
             sample_rate: Audio sample rate (default 16000)
             _language: ISO-639-3 language code (default "eng" for English, unused)
             _batch_size: Batch size for alignment model (unused)
+            offset_compensation: Time offset in seconds to subtract from timestamps
+                to compensate for Wav2Vec2 look-ahead (default: 0.04s / 40ms).
+                Set to 0 to disable.
         Returns:
             List of dicts with 'word', 'start', 'end' keys
         """
         import torchaudio
         device = _get_device()
         model, labels, dictionary = cls.get_instance(device)
         # Normalize text: uppercase, keep only valid characters
         transcript = text.upper()
+        # Build tokens from transcript (including word separators)
         tokens = []
         for char in transcript:
             if char in dictionary:
         if not tokens:
             return []
+        # Build Viterbi trellis and backtrack for optimal path
+        trellis = cls._get_trellis(emission, tokens, blank_id=0)
+        alignment_path = cls._backtrack(trellis, emission, tokens, blank_id=0)
         # Convert frame indices to time (model stride is 320 samples at 16kHz = 20ms)
         frame_duration = 320 / cls._bundle.sample_rate
+        # Apply offset compensation for Wav2Vec2 look-ahead
+        offset = offset_compensation if offset_compensation is not None else cls.OFFSET_COMPENSATION
+        # Group aligned tokens into words based on pipe separator
         words = text.split()
         word_timestamps = []
         current_word_start = None
         current_word_end = None
         word_idx = 0
+        separator_id = dictionary.get("|", dictionary.get(" ", 0))
+        for token_id, start_frame, end_frame in alignment_path:
+            if token_id == separator_id:  # Word separator
                 if current_word_start is not None and word_idx < len(words):
+                    start_time = max(0.0, current_word_start * frame_duration - offset)
+                    end_time = max(0.0, current_word_end * frame_duration - offset)
                     word_timestamps.append(
                         {
                             "word": words[word_idx],
+                            "start": start_time,
+                            "end": end_time,
                         }
                     )
                     word_idx += 1
                 current_word_end = None
             else:
                 if current_word_start is None:
+                    current_word_start = start_frame
+                current_word_end = end_frame
         # Don't forget the last word
         if current_word_start is not None and word_idx < len(words):
+            start_time = max(0.0, current_word_start * frame_duration - offset)
+            end_time = max(0.0, current_word_end * frame_duration - offset)
             word_timestamps.append(
                 {
                     "word": words[word_idx],
+                    "start": start_time,
+                    "end": end_time,
                 }
             )