Update custom model files, README, and requirements
alignment.py: CHANGED (+177, -44)
@@ -1,10 +1,13 @@
 """Forced alignment for word-level timestamps using Wav2Vec2."""

+import math
+from dataclasses import dataclass
+
 import numpy as np
 import torch

-#
-
+# Beam search width for backtracking (from WhisperX)
+BEAM_WIDTH = 2

 # Offset compensation for Wav2Vec2-BASE systematic bias (in seconds)
 # Calibrated on librispeech-alignments dataset

@@ -21,6 +24,25 @@ def _get_device() -> str:
     return "cpu"


+@dataclass
+class Point:
+    """A point in the alignment path."""
+
+    token_index: int
+    time_index: int
+    score: float
+
+
+@dataclass
+class BeamState:
+    """State in beam search backtracking."""
+
+    token_index: int
+    time_index: int
+    score: float
+    path: list[Point]
+
+
 class ForcedAligner:
     """Lazy-loaded forced aligner for word-level timestamps using torchaudio wav2vec2.


@@ -52,30 +74,6 @@ class ForcedAligner:
         cls._dictionary = {c: i for i, c in enumerate(cls._labels)}
         return cls._model, cls._labels, cls._dictionary

-    @staticmethod
-    def _get_emission_score(
-        emission: torch.Tensor, token: int, blank_id: int = 0
-    ) -> torch.Tensor:
-        """Get emission score for a token, handling wildcards.
-
-        For wildcard tokens (WILDCARD_TOKEN), returns the max score over all
-        non-blank tokens - allowing any character to match.
-
-        Args:
-            emission: Emission vector for a single frame (num_classes,)
-            token: Token index, or WILDCARD_TOKEN for out-of-vocabulary chars
-            blank_id: Index of the blank/CTC token
-
-        Returns:
-            Emission score (scalar tensor)
-        """
-        if token == WILDCARD_TOKEN:
-            # Wildcard: take max over all non-blank tokens
-            mask = torch.ones(emission.size(0), dtype=torch.bool)
-            mask[blank_id] = False
-            return emission[mask].max()
-        return emission[token]
-
     @staticmethod
     def _get_trellis(emission: torch.Tensor, tokens: list[int], blank_id: int = 0) -> torch.Tensor:
         """Build trellis for forced alignment using forward algorithm.

@@ -85,7 +83,7 @@ class ForcedAligner:

         Args:
             emission: Log-softmax emission matrix of shape (num_frames, num_classes)
-            tokens: List of target token indices
+            tokens: List of target token indices
             blank_id: Index of the blank/CTC token (default 0)

         Returns:

@@ -103,13 +101,7 @@ class ForcedAligner:
             stay = trellis[t, j] + emission[t, blank_id]

             # Move: emit token j and advance to j+1 tokens
-            if j > 0:
-                token_score = ForcedAligner._get_emission_score(
-                    emission[t], tokens[j - 1], blank_id
-                )
-                move = trellis[t, j - 1] + token_score
-            else:
-                move = -float("inf")
+            move = trellis[t, j - 1] + emission[t, tokens[j - 1]] if j > 0 else -float("inf")

             trellis[t + 1, j] = max(stay, move)  # Viterbi: take best path

@@ -154,10 +146,7 @@ class ForcedAligner:
         while t > 0 and j > 0:
             # Check: did we transition from j-1 to j at frame t-1?
             stay_score = trellis[t - 1, j] + emission[t - 1, blank_id]
-            token_score = ForcedAligner._get_emission_score(
-                emission[t - 1], tokens[j - 1], blank_id
-            )
-            move_score = trellis[t - 1, j - 1] + token_score
+            move_score = trellis[t - 1, j - 1] + emission[t - 1, tokens[j - 1]]

             if move_score >= stay_score:
                 # Token j-1 was emitted at frame t-1

@@ -189,6 +178,148 @@ class ForcedAligner:

         return token_spans

+    @staticmethod
+    def _backtrack_beam(
+        trellis: torch.Tensor,
+        emission: torch.Tensor,
+        tokens: list[int],
+        blank_id: int = 0,
+        beam_width: int = BEAM_WIDTH,
+    ) -> list[Point] | None:
+        """Beam search backtracking for better alignment paths.
+
+        Explores multiple candidate paths simultaneously, keeping the top beam_width
+        paths at each step. This can find better alignments than greedy backtracking.
+
+        Based on the WhisperX implementation.
+
+        Args:
+            trellis: Trellis matrix from forward pass
+            emission: Log-softmax emission matrix
+            tokens: List of target token indices
+            blank_id: Index of the blank/CTC token
+            beam_width: Number of candidate paths to keep
+
+        Returns:
+            List of Points representing the best alignment path, or None if failed
+        """
+        T, J = trellis.size(0) - 1, trellis.size(1) - 1
+
+        if J == 0:
+            return None
+
+        init_state = BeamState(
+            token_index=J,
+            time_index=T,
+            score=trellis[T, J].item(),
+            path=[Point(J, T, emission[T, blank_id].exp().item())],
+        )
+
+        beams = [init_state]
+
+        while beams and beams[0].token_index > 0:
+            next_beams = []
+
+            for beam in beams:
+                t, j = beam.time_index, beam.token_index
+
+                if t <= 0:
+                    continue
+
+                p_stay = emission[t - 1, blank_id]
+                p_change = emission[t - 1, tokens[j - 1]] if j > 0 else float("-inf")
+
+                stay_score = trellis[t - 1, j].item()
+                change_score = trellis[t - 1, j - 1].item() if j > 0 else float("-inf")
+
+                # Stay option
+                if not math.isinf(stay_score):
+                    new_path = beam.path.copy()
+                    new_path.append(Point(j, t - 1, p_stay.exp().item()))
+                    next_beams.append(
+                        BeamState(
+                            token_index=j,
+                            time_index=t - 1,
+                            score=stay_score,
+                            path=new_path,
+                        )
+                    )
+
+                # Change option
+                if j > 0 and not math.isinf(change_score):
+                    new_path = beam.path.copy()
+                    new_path.append(Point(j - 1, t - 1, p_change.exp().item()))
+                    next_beams.append(
+                        BeamState(
+                            token_index=j - 1,
+                            time_index=t - 1,
+                            score=change_score,
+                            path=new_path,
+                        )
+                    )
+
+            # Keep top beam_width paths by score
+            beams = sorted(next_beams, key=lambda x: x.score, reverse=True)[:beam_width]
+
+            if not beams:
+                break
+
+        if not beams:
+            return None
+
+        # Fill remaining time steps with blank emissions
+        best_beam = beams[0]
+        t = best_beam.time_index
+        j = best_beam.token_index
+        while t > 0:
+            prob = emission[t - 1, blank_id].exp().item()
+            best_beam.path.append(Point(j, t - 1, prob))
+            t -= 1
+
+        return best_beam.path[::-1]
+
+    @staticmethod
+    def _path_to_spans(
+        path: list[Point], tokens: list[int]
+    ) -> list[tuple[int, float, float]]:
+        """Convert a beam search path to token spans.
+
+        Args:
+            path: List of Points from beam search
+            tokens: List of target token indices
+
+        Returns:
+            List of (token_id, start_frame, end_frame) tuples
+        """
+        if not path or not tokens:
+            return []
+
+        num_tokens = len(tokens)
+        token_frames: list[list[int]] = [[] for _ in range(num_tokens)]
+
+        # Group frames by token index
+        for point in path:
+            if 0 < point.token_index <= num_tokens:
+                token_frames[point.token_index - 1].append(point.time_index)
+
+        # Convert to spans
+        token_spans: list[tuple[int, float, float]] = []
+        for token_idx, frames in enumerate(token_frames):
+            if not frames:
+                # Token never emitted - assign minimal span after previous
+                if token_spans:
+                    prev_end = token_spans[-1][2]
+                    frames = [int(prev_end)]
+                else:
+                    frames = [0]
+
+            token_id = tokens[token_idx]
+            start_frame = float(min(frames))
+            end_frame = float(max(frames)) + 1.0
+            token_spans.append((token_id, start_frame, end_frame))
+
+        return token_spans
+
     @classmethod
     def align(
         cls,

@@ -243,27 +374,29 @@ class ForcedAligner:

         emission = emissions[0].cpu()

-        # Normalize text: uppercase
+        # Normalize text: uppercase, keep only valid characters
         transcript = text.upper()

         # Build tokens from transcript (including word separators)
-        # Unknown characters get WILDCARD_TOKEN which matches any non-blank emission
        tokens = []
         for char in transcript:
             if char in dictionary:
                 tokens.append(dictionary[char])
             elif char == " ":
                 tokens.append(dictionary.get("|", dictionary.get(" ", 0)))
-            else:
-                # Out-of-vocabulary character - use wildcard
-                tokens.append(WILDCARD_TOKEN)

         if not tokens:
             return []

         # Build Viterbi trellis and backtrack for optimal path
         trellis = cls._get_trellis(emission, tokens, blank_id=0)
-        alignment_path = cls._backtrack(trellis, emission, tokens, blank_id=0)
+
+        # Try beam search first, fall back to greedy if it fails
+        beam_path = cls._backtrack_beam(trellis, emission, tokens, blank_id=0)
+        if beam_path is not None:
+            alignment_path = cls._path_to_spans(beam_path, tokens)
+        else:
+            alignment_path = cls._backtrack(trellis, emission, tokens, blank_id=0)

         # Convert frame indices to time (model stride is 320 samples at 16kHz = 20ms)
         frame_duration = 320 / cls._bundle.sample_rate
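
A quick sanity check of how the new _path_to_spans groups a backtracked path into spans: each Point's 1-based token_index selects an entry in tokens, the frames collected for a token become the span [min, max + 1), and a token that never appears inherits a minimal span after the previous one. The path below is hand-built for illustration (not real beam-search output), and the import line assumes the module is importable as "alignment"; neither is part of the diff.

from alignment import ForcedAligner, Point  # assumed import path, for illustration only

# Hand-built toy path: two tokens with ids 7 and 3.
path = [
    Point(token_index=1, time_index=0, score=0.9),
    Point(token_index=1, time_index=1, score=0.8),
    Point(token_index=2, time_index=2, score=0.7),
    Point(token_index=2, time_index=3, score=0.6),
]
spans = ForcedAligner._path_to_spans(path, tokens=[7, 3])
print(spans)  # [(7, 0.0, 2.0), (3, 2.0, 4.0)]: frames 0-1 for id 7, frames 2-3 for id 3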
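
For the time conversion in the last hunk: with the bundle's 16 kHz sample rate and the 320-sample model stride, each emission frame covers 320 / 16000 = 0.02 s, so span boundaries expressed in frames are scaled by that duration (before the separate offset compensation mentioned at the top of the file). A minimal sketch of that arithmetic follows; the span_to_seconds helper and the example span are illustrative, not part of alignment.py.

MODEL_STRIDE = 320    # samples per emission frame, as noted in the diff
SAMPLE_RATE = 16_000  # assumed Wav2Vec2-BASE bundle sample rate (16 kHz)

def span_to_seconds(span: tuple[int, float, float]) -> tuple[float, float]:
    """Map a (token_id, start_frame, end_frame) span to (start_s, end_s)."""
    frame_duration = MODEL_STRIDE / SAMPLE_RATE  # 0.02 s per frame
    _token_id, start_frame, end_frame = span
    return start_frame * frame_duration, end_frame * frame_duration

print(span_to_seconds((5, 12.0, 19.0)))  # ~ (0.24, 0.38) seconds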