mazesmazes
/

tiny-audio

@@ -56,10 +56,10 @@ class ForcedAligner:
     @staticmethod
     def _get_trellis(emission: torch.Tensor, tokens: list[int], blank_id: int = 0) -> torch.Tensor:
-        """Build Viterbi trellis for forced alignment.
-        The trellis is a 2D matrix where trellis[t, j] represents the log probability
-        of the most likely path that has emitted j tokens at time t.
         Args:
             emission: Log-softmax emission matrix of shape (num_frames, num_classes)
@@ -72,25 +72,21 @@ class ForcedAligner:
         num_frames = emission.size(0)
         num_tokens = len(tokens)
-        # Initialize trellis with -inf (impossible paths)
         trellis = torch.full((num_frames + 1, num_tokens + 1), -float("inf"))
-        trellis[0, 0] = 0  # Start state has probability 1
         for t in range(num_frames):
             for j in range(num_tokens + 1):
-                # Stay in current state (emit blank)
-                if j < num_tokens + 1:
-                    stay_prob = trellis[t, j] + emission[t, blank_id]
-                else:
-                    stay_prob = -float("inf")
-                # Move to next state (emit token)
                 if j > 0:
-                    move_prob = trellis[t, j - 1] + emission[t, tokens[j - 1]]
                 else:
-                    move_prob = -float("inf")
-                trellis[t + 1, j] = max(stay_prob, move_prob)
         return trellis
@@ -100,60 +96,63 @@ class ForcedAligner:
     ) -> list[tuple[int, int, int]]:
         """Backtrack through trellis to find optimal alignment path.
-        Args:
-            trellis: Trellis matrix from _get_trellis
-            emission: Log-softmax emission matrix
-            tokens: List of target token indices
-            blank_id: Index of the blank/CTC token
-        Returns:
-            List of (token_idx, start_frame, end_frame) tuples
         """
         num_frames = emission.size(0)
         num_tokens = len(tokens)
-        # Start from the end
         t = num_frames
         j = num_tokens
-        path = []
-        # Backtrack to find where each token was emitted
-        while j > 0:
-            # Find the frame where token j-1 was first emitted
-            token_end = t
-            # Walk back while staying in state j (emitting blanks)
-            while t > 0:
-                stay_prob = trellis[t - 1, j] + emission[t - 1, blank_id]
-                if j > 0:
-                    move_prob = trellis[t - 1, j - 1] + emission[t - 1, tokens[j - 1]]
-                else:
-                    move_prob = -float("inf")
-                # Check if we moved into this state or stayed
-                if move_prob > stay_prob:
-                    # We moved into state j at time t-1
-                    token_start = t - 1
-                    path.append((tokens[j - 1], token_start, token_end))
-                    j -= 1
-                    t -= 1
-                    break
-                else:
-                    # We stayed in state j
-                    t -= 1
-            if t == 0 and j > 0:
-                # Handle edge case: remaining tokens at the start
-                path.append((tokens[j - 1], 0, token_end))
                 j -= 1
-        # Reverse to get chronological order
         path.reverse()
-        return path
     # Sub-frame offset to compensate for Wav2Vec2 convolutional look-ahead (in seconds)
     # This makes timestamps feel more "natural" by shifting them earlier
-    OFFSET_COMPENSATION = 0.04  # 40ms
     @classmethod
     def align(

     @staticmethod
     def _get_trellis(emission: torch.Tensor, tokens: list[int], blank_id: int = 0) -> torch.Tensor:
+        """Build trellis for forced alignment using forward algorithm.
+        trellis[t, j] is the log of the total probability, summed over all
+        paths, of aligning the first j tokens to the first t frames.
         Args:
             emission: Log-softmax emission matrix of shape (num_frames, num_classes)
         num_frames = emission.size(0)
         num_tokens = len(tokens)
         trellis = torch.full((num_frames + 1, num_tokens + 1), -float("inf"))
+        trellis[0, 0] = 0
         for t in range(num_frames):
             for j in range(num_tokens + 1):
+                # Stay: emit blank and stay at j tokens
+                stay = trellis[t, j] + emission[t, blank_id]
+                # Move: emit tokens[j - 1] and advance from j - 1 to j tokens
                 if j > 0:
+                    move = trellis[t, j - 1] + emission[t, tokens[j - 1]]
                 else:
+                    move = torch.tensor(-float("inf"))
+                trellis[t + 1, j] = torch.logaddexp(torch.tensor(stay), move).item()
         return trellis
     ) -> list[tuple[int, int, int]]:
         """Backtrack through trellis to find optimal alignment path.
+        Returns list of (token_id, start_frame, end_frame) for each token.
         """
         num_frames = emission.size(0)
         num_tokens = len(tokens)
+        # Trace back from final state
         t = num_frames
         j = num_tokens
+        path = []  # Will store (frame, token_index) pairs
+        while t > 0 and j >= 0:
+            # At position (t, j), we need to determine if we got here by:
+            # 1. Staying at j (emitting blank at frame t-1)
+            # 2. Moving from j-1 to j (emitting token j-1 at frame t-1)
+            if j == 0:
+                # Can only stay (no previous token state to come from)
+                t -= 1
+                continue
+            # Compare which transition was more likely
+            stay_score = trellis[t - 1, j] + emission[t - 1, blank_id]
+            move_score = trellis[t - 1, j - 1] + emission[t - 1, tokens[j - 1]]
+            if move_score > stay_score:
+                # Token j-1 was emitted at frame t-1
+                path.append((t - 1, j - 1))
                 j -= 1
+            t -= 1
         path.reverse()
+        # Convert path to token spans with start/end frames
+        if not path:
+            return []
+        token_spans = []
+        i = 0
+        while i < len(path):
+            frame, token_idx = path[i]
+            start_frame = frame
+            # Find end frame (where this token stops being emitted)
+            end_frame = frame + 1
+            while i + 1 < len(path) and path[i + 1][1] == token_idx:
+                i += 1
+                end_frame = path[i][0] + 1
+            token_spans.append((tokens[token_idx], start_frame, end_frame))
+            i += 1
+        return token_spans
     # Sub-frame offset to compensate for Wav2Vec2 convolutional look-ahead (in seconds)
     # This makes timestamps feel more "natural" by shifting them earlier
+    OFFSET_COMPENSATION = 0.02  # 20ms
     @classmethod
     def align(