mazesmazes
/

tiny-audio

@@ -3,6 +3,14 @@
 import numpy as np
 import torch
 def _get_device() -> str:
     """Get best available device for non-transformers models."""
@@ -44,6 +52,30 @@ class ForcedAligner:
             cls._dictionary = {c: i for i, c in enumerate(cls._labels)}
         return cls._model, cls._labels, cls._dictionary
     @staticmethod
     def _get_trellis(emission: torch.Tensor, tokens: list[int], blank_id: int = 0) -> torch.Tensor:
         """Build trellis for forced alignment using forward algorithm.
@@ -53,7 +85,7 @@ class ForcedAligner:
         Args:
             emission: Log-softmax emission matrix of shape (num_frames, num_classes)
-            tokens: List of target token indices
             blank_id: Index of the blank/CTC token (default 0)
         Returns:
@@ -71,7 +103,13 @@ class ForcedAligner:
                 stay = trellis[t, j] + emission[t, blank_id]
                 # Move: emit token j and advance to j+1 tokens
-                move = trellis[t, j - 1] + emission[t, tokens[j - 1]] if j > 0 else -float("inf")
                 trellis[t + 1, j] = max(stay, move)  # Viterbi: take best path
@@ -116,7 +154,10 @@ class ForcedAligner:
         while t > 0 and j > 0:
             # Check: did we transition from j-1 to j at frame t-1?
             stay_score = trellis[t - 1, j] + emission[t - 1, blank_id]
-            move_score = trellis[t - 1, j - 1] + emission[t - 1, tokens[j - 1]]
             if move_score >= stay_score:
                 # Token j-1 was emitted at frame t-1
@@ -148,11 +189,6 @@ class ForcedAligner:
         return token_spans
-    # Offset compensation for Wav2Vec2-BASE systematic bias (in seconds)
-    # Calibrated on librispeech-alignments dataset
-    START_OFFSET = 0.06  # Subtract from start times (shift earlier)
-    END_OFFSET = -0.03  # Add to end times (shift later)
     @classmethod
     def align(
         cls,
@@ -207,16 +243,20 @@ class ForcedAligner:
         emission = emissions[0].cpu()
-        # Normalize text: uppercase, keep only valid characters
         transcript = text.upper()
         # Build tokens from transcript (including word separators)
         tokens = []
         for char in transcript:
             if char in dictionary:
                 tokens.append(dictionary[char])
             elif char == " ":
                 tokens.append(dictionary.get("|", dictionary.get(" ", 0)))
         if not tokens:
             return []
@@ -229,8 +269,8 @@ class ForcedAligner:
         frame_duration = 320 / cls._bundle.sample_rate
         # Apply separate offset compensation for start/end (Wav2Vec2 systematic bias)
-        start_offset = cls.START_OFFSET
-        end_offset = cls.END_OFFSET
         # Group aligned tokens into words based on pipe separator
         words = text.split()

 import numpy as np
 import torch
+# Wildcard token ID for out-of-vocabulary characters.
+# It is a sentinel, not a class index: used directly as an index, -1 would
+# select the last entry of an emission vector, so it must be checked for first.
+WILDCARD_TOKEN = -1
+# Offset compensation for Wav2Vec2-BASE systematic bias (in seconds)
+# Calibrated on librispeech-alignments dataset
+START_OFFSET = 0.06  # Subtract from start times (shift earlier)
+# NOTE(review): the value below is negative, so literally adding it moves end
+# times earlier - confirm the sign convention at the point where it is applied.
+END_OFFSET = -0.03  # Add to end times (shift later)
 def _get_device() -> str:
     """Get best available device for non-transformers models."""
             cls._dictionary = {c: i for i, c in enumerate(cls._labels)}
         return cls._model, cls._labels, cls._dictionary
+    @staticmethod
+    def _get_emission_score(
+        emission: torch.Tensor, token: int, blank_id: int = 0
+    ) -> torch.Tensor:
+        """Look up one frame's emission score for a target token.
+        Regular tokens index straight into the frame. WILDCARD_TOKEN stands for
+        an out-of-vocabulary character and scores as the best non-blank class
+        in the frame, so any emitted character can satisfy it.
+        Args:
+            emission: Emission vector for a single frame (num_classes,)
+            token: Token index, or WILDCARD_TOKEN for out-of-vocabulary chars
+            blank_id: Index of the blank/CTC token
+        Returns:
+            Emission score (scalar tensor)
+        """
+        if token != WILDCARD_TOKEN:
+            # Ordinary token: its own entry in the frame is the score
+            return emission[token]
+        # Wildcard: hide the blank entry, then keep the strongest remaining class
+        non_blank = torch.ones(emission.size(0), dtype=torch.bool)
+        non_blank[blank_id] = False
+        return emission[non_blank].max()
     @staticmethod
     def _get_trellis(emission: torch.Tensor, tokens: list[int], blank_id: int = 0) -> torch.Tensor:
         """Build trellis for forced alignment using forward algorithm.
         Args:
             emission: Log-softmax emission matrix of shape (num_frames, num_classes)
+            tokens: List of target token indices (WILDCARD_TOKEN for OOV chars)
             blank_id: Index of the blank/CTC token (default 0)
         Returns:
                 stay = trellis[t, j] + emission[t, blank_id]
                 # Move: emit token j and advance to j+1 tokens
+                if j > 0:
+                    token_score = ForcedAligner._get_emission_score(
+                        emission[t], tokens[j - 1], blank_id
+                    )
+                    move = trellis[t, j - 1] + token_score
+                else:
+                    move = -float("inf")
                 trellis[t + 1, j] = max(stay, move)  # Viterbi: take best path
         while t > 0 and j > 0:
             # Check: did we transition from j-1 to j at frame t-1?
             stay_score = trellis[t - 1, j] + emission[t - 1, blank_id]
+            token_score = ForcedAligner._get_emission_score(
+                emission[t - 1], tokens[j - 1], blank_id
+            )
+            move_score = trellis[t - 1, j - 1] + token_score
             if move_score >= stay_score:
                 # Token j-1 was emitted at frame t-1
         return token_spans
     @classmethod
     def align(
         cls,
         emission = emissions[0].cpu()
+        # Normalize text: uppercase
         transcript = text.upper()
         # Build tokens from transcript (including word separators)
+        # Unknown characters get WILDCARD_TOKEN which matches any non-blank emission
         tokens = []
         for char in transcript:
             if char in dictionary:
                 tokens.append(dictionary[char])
             elif char == " ":
                 tokens.append(dictionary.get("|", dictionary.get(" ", 0)))
+            else:
+                # Out-of-vocabulary character - use wildcard
+                tokens.append(WILDCARD_TOKEN)
         if not tokens:
             return []
         frame_duration = 320 / cls._bundle.sample_rate
         # Apply separate offset compensation for start/end (Wav2Vec2 systematic bias)
+        start_offset = START_OFFSET
+        end_offset = END_OFFSET
         # Group aligned tokens into words based on pipe separator
         words = text.split()