mazesmazes
/

tiny-audio

@@ -1,9 +1,31 @@
 """Forced alignment for word-level timestamps using Wav2Vec2."""
 import numpy as np
 import torch
 def _get_device() -> str:
     """Get best available device for non-transformers models."""
     if torch.cuda.is_available():
@@ -16,7 +38,7 @@ def _get_device() -> str:
 class ForcedAligner:
     """Lazy-loaded forced aligner for word-level timestamps using torchaudio wav2vec2.
-    Uses Viterbi trellis algorithm for optimal alignment path finding.
     """
     _bundle = None
@@ -78,75 +100,158 @@ class ForcedAligner:
         return trellis
     @staticmethod
-    def _backtrack(
-        trellis: torch.Tensor, emission: torch.Tensor, tokens: list[int], blank_id: int = 0
-    ) -> list[tuple[int, float, float]]:
-        """Backtrack through trellis to find optimal forced monotonic alignment.
-        Guarantees:
-        - All tokens are emitted exactly once
-        - Strictly monotonic: each token's frames come after previous token's
-        - No frame skipping or token teleporting
-        Returns list of (token_id, start_frame, end_frame) for each token.
         """
-        num_frames = emission.size(0)
-        num_tokens = len(tokens)
         if num_tokens == 0:
-            return []
-        # Find the best ending point (should be at num_tokens)
-        # But verify trellis reached a valid state
-        if trellis[num_frames, num_tokens] == -float("inf"):
-            # Alignment failed - fall back to uniform distribution
             frames_per_token = num_frames / num_tokens
             return [
                 (tokens[i], i * frames_per_token, (i + 1) * frames_per_token)
                 for i in range(num_tokens)
             ]
-        # Backtrack: find where each token transition occurred
-        # path[i] = frame where token i was first emitted
         token_frames: list[list[int]] = [[] for _ in range(num_tokens)]
-        t = num_frames
-        j = num_tokens
-        while t > 0 and j > 0:
-            # Check: did we transition from j-1 to j at frame t-1?
-            stay_score = trellis[t - 1, j] + emission[t - 1, blank_id]
-            move_score = trellis[t - 1, j - 1] + emission[t - 1, tokens[j - 1]]
-            if move_score >= stay_score:
-                # Token j-1 was emitted at frame t-1
-                token_frames[j - 1].insert(0, t - 1)
-                j -= 1
-            # Always decrement time (monotonic)
-            t -= 1
-        # Handle any remaining tokens at the start (edge case)
-        while j > 0:
-            token_frames[j - 1].insert(0, 0)
-            j -= 1
         # Convert to spans
         token_spans: list[tuple[int, float, float]] = []
-        for token_idx, frames in enumerate(token_frames):
             if not frames:
-                # Token never emitted - assign minimal span after previous
                 if token_spans:
                     prev_end = token_spans[-1][2]
-                    frames = [int(prev_end)]
                 else:
-                    frames = [0]
-            token_id = tokens[token_idx]
-            frame_probs = emission[frames, token_id]
-            peak_idx = int(torch.argmax(frame_probs).item())
-            peak_frame = frames[peak_idx]
-            token_spans.append((token_id, float(peak_frame), float(peak_frame) + 1.0))
         return token_spans
@@ -158,22 +263,20 @@ class ForcedAligner:
     @classmethod
     def align(
         cls,
-        audio: np.ndarray,
         text: str,
         sample_rate: int = 16000,
-        _language: str = "eng",
-        _batch_size: int = 16,
     ) -> list[dict]:
         """Align transcript to audio and return word-level timestamps.
-        Uses Viterbi trellis algorithm for optimal forced alignment.
         Args:
-            audio: Audio waveform as numpy array
             text: Transcript text to align
             sample_rate: Audio sample rate (default 16000)
-            _language: ISO-639-3 language code (default "eng" for English, unused)
-            _batch_size: Batch size for alignment model (unused)
         Returns:
             List of dicts with 'word', 'start', 'end' keys
@@ -181,7 +284,7 @@ class ForcedAligner:
         import torchaudio
         device = _get_device()
-        model, _labels, dictionary = cls.get_instance(device)
         assert cls._bundle is not None and dictionary is not None  # Initialized by get_instance
         # Convert audio to tensor (copy to ensure array is writable)
@@ -223,9 +326,10 @@ class ForcedAligner:
         if not tokens:
             return []
-        # Build Viterbi trellis and backtrack for optimal path
         trellis = cls._get_trellis(emission, tokens, blank_id=0)
-        alignment_path = cls._backtrack(trellis, emission, tokens, blank_id=0)
         # Convert frame indices to time (model stride is 320 samples at 16kHz = 20ms)
         frame_duration = 320 / cls._bundle.sample_rate

 """Forced alignment for word-level timestamps using Wav2Vec2."""
+import math
+from dataclasses import dataclass, field
 import numpy as np
 import torch
@dataclass
class Point:
    """One frame of an alignment path.

    Attributes:
        token_index: Token position this frame is attributed to. Consumers
            in this module treat it as 1-indexed, with 0 meaning "before
            the first token".
        time_index: Frame index along the emission time axis.
        score: Emission probability recorded for this frame.
    """

    token_index: int
    time_index: int
    score: float
@dataclass
class BeamState:
    """A single hypothesis kept alive while backtracking with a beam.

    Attributes:
        token_index: Token coordinate of the trellis cell the hypothesis
            currently sits in.
        time_index: Time coordinate of that trellis cell.
        score: Value used to rank this hypothesis against the others.
        path: Points collected so far. Every instance gets its own fresh
            list, so hypotheses never share a default path.
    """

    token_index: int
    time_index: int
    score: float
    path: list[Point] = field(default_factory=list)
 def _get_device() -> str:
     """Get best available device for non-transformers models."""
     if torch.cuda.is_available():
 class ForcedAligner:
     """Lazy-loaded forced aligner for word-level timestamps using torchaudio wav2vec2.
+    Uses CTC trellis with beam search backtracking for optimal alignment path finding.
     """
     _bundle = None
         return trellis
     @staticmethod
+    def _backtrack_beam(
+        trellis: torch.Tensor,
+        emission: torch.Tensor,
+        tokens: list[int],
+        blank_id: int = 0,
+        beam_width: int = 5,
+    ) -> list[Point] | None:
+        """Beam search backtracking through trellis.
+        Maintains multiple hypotheses during decoding, pruning to top candidates
+        by cumulative score at each step.
+        Args:
+            trellis: Trellis matrix of shape (num_frames + 1, num_tokens + 1)
+            emission: Log-softmax emission matrix of shape (num_frames, num_classes)
+            tokens: List of target token indices
+            blank_id: Index of the blank/CTC token (default 0)
+            beam_width: Number of top paths to keep during beam search (default 5)
+        Returns:
+            List of Point objects representing the best alignment path, or None if failed.
         """
+        num_frames = trellis.size(0) - 1
+        num_tokens = trellis.size(1) - 1
         if num_tokens == 0:
+            return None
+        # Check if alignment is possible
+        if math.isinf(trellis[num_frames, num_tokens].item()):
+            return None
+        # Initialize beam with final state
+        init_state = BeamState(
+            token_index=num_tokens,
+            time_index=num_frames,
+            score=trellis[num_frames, num_tokens].item(),
+            path=[Point(num_tokens, num_frames, emission[num_frames - 1, blank_id].exp().item())],
+        )
+        beams = [init_state]
+        # Beam search backtracking
+        while beams and beams[0].token_index > 0:
+            next_beams = []
+            for beam in beams:
+                t, j = beam.time_index, beam.token_index
+                if t <= 0:
+                    continue
+                stay_score = trellis[t - 1, j].item()
+                change_score = trellis[t - 1, j - 1].item() if j > 0 else float("-inf")
+                # Stay transition (emit blank)
+                if not math.isinf(stay_score):
+                    prob = emission[t - 1, blank_id].exp().item()
+                    new_path = beam.path.copy()
+                    new_path.append(Point(j, t - 1, prob))
+                    next_beams.append(
+                        BeamState(
+                            token_index=j,
+                            time_index=t - 1,
+                            score=stay_score,
+                            path=new_path,
+                        )
+                    )
+                # Change transition (emit token)
+                if j > 0 and not math.isinf(change_score):
+                    prob = emission[t - 1, tokens[j - 1]].exp().item()
+                    new_path = beam.path.copy()
+                    new_path.append(Point(j - 1, t - 1, prob))
+                    next_beams.append(
+                        BeamState(
+                            token_index=j - 1,
+                            time_index=t - 1,
+                            score=change_score,
+                            path=new_path,
+                        )
+                    )
+            # Prune to top beam_width candidates
+            beams = sorted(next_beams, key=lambda x: x.score, reverse=True)[:beam_width]
+            if not beams:
+                break
+        if not beams:
+            return None
+        # Complete path to beginning
+        best_beam = beams[0]
+        t = best_beam.time_index
+        j = best_beam.token_index
+        while t > 0:
+            prob = emission[t - 1, blank_id].exp().item()
+            best_beam.path.append(Point(j, t - 1, prob))
+            t -= 1
+        return best_beam.path[::-1]
+    @staticmethod
+    def _path_to_spans(
+        path: list[Point] | None, tokens: list[int], num_frames: int
+    ) -> list[tuple[int, float, float]]:
+        """Convert beam search path to token spans.
+        Args:
+            path: List of Point objects from beam search, or None
+            tokens: List of target token indices
+            num_frames: Total number of frames
+        Returns:
+            List of (token_id, start_frame, end_frame) for each token.
+        """
+        num_tokens = len(tokens)
+        if path is None or num_tokens == 0:
+            # Fall back to uniform distribution
+            if num_tokens == 0:
+                return []
             frames_per_token = num_frames / num_tokens
             return [
                 (tokens[i], i * frames_per_token, (i + 1) * frames_per_token)
                 for i in range(num_tokens)
             ]
+        # Group frames by token index
         token_frames: list[list[int]] = [[] for _ in range(num_tokens)]
+        for point in path:
+            # Token index in path is 1-indexed (0 = before first token)
+            if 0 < point.token_index <= num_tokens:
+                token_frames[point.token_index - 1].append(point.time_index)
         # Convert to spans
         token_spans: list[tuple[int, float, float]] = []
+        for token_idx in range(num_tokens):
+            frames = token_frames[token_idx]
             if not frames:
+                # Token never emitted - assign span after previous
                 if token_spans:
                     prev_end = token_spans[-1][2]
+                    start_frame = prev_end
                 else:
+                    start_frame = 0.0
+                token_spans.append((tokens[token_idx], start_frame, start_frame + 1.0))
+            else:
+                start_frame = float(min(frames))
+                end_frame = float(max(frames)) + 1.0
+                token_spans.append((tokens[token_idx], start_frame, end_frame))
         return token_spans
     @classmethod
     def align(
         cls,
+        audio: np.ndarray | torch.Tensor,
         text: str,
         sample_rate: int = 16000,
+        beam_width: int = 5,
     ) -> list[dict]:
         """Align transcript to audio and return word-level timestamps.
+        Uses CTC trellis with beam search backtracking for optimal forced alignment.
         Args:
+            audio: Audio waveform as numpy array or torch tensor
             text: Transcript text to align
             sample_rate: Audio sample rate (default 16000)
+            beam_width: Number of paths to keep during beam search (default 5)
         Returns:
             List of dicts with 'word', 'start', 'end' keys
         import torchaudio
         device = _get_device()
+        model, _, dictionary = cls.get_instance(device)
         assert cls._bundle is not None and dictionary is not None  # Initialized by get_instance
         # Convert audio to tensor (copy to ensure array is writable)
         if not tokens:
             return []
+        # Build CTC trellis and use beam search backtracking
         trellis = cls._get_trellis(emission, tokens, blank_id=0)
+        path = cls._backtrack_beam(trellis, emission, tokens, blank_id=0, beam_width=beam_width)
+        alignment_path = cls._path_to_spans(path, tokens, emission.size(0))
         # Convert frame indices to time (model stride is 320 samples at 16kHz = 20ms)
         frame_duration = 320 / cls._bundle.sample_rate