mazesmazes
/

tiny-audio

@@ -1,31 +1,9 @@
 """Forced alignment for word-level timestamps using Wav2Vec2."""
-import math
-from dataclasses import dataclass, field
 import numpy as np
 import torch
-@dataclass
-class Point:
-    """A point in the alignment path."""
-    token_index: int
-    time_index: int
-    score: float
-@dataclass
-class BeamState:
-    """State in beam search backtracking."""
-    token_index: int
-    time_index: int
-    score: float
-    path: list[Point] = field(default_factory=list)
 def _get_device() -> str:
     """Get best available device for non-transformers models."""
     if torch.cuda.is_available():
@@ -38,7 +16,7 @@ def _get_device() -> str:
 class ForcedAligner:
     """Lazy-loaded forced aligner for word-level timestamps using torchaudio wav2vec2.
-    Uses CTC trellis with beam search backtracking for optimal alignment path finding.
     """
     _bundle = None
@@ -100,158 +78,73 @@ class ForcedAligner:
         return trellis
     @staticmethod
-    def _backtrack_beam(
-        trellis: torch.Tensor,
-        emission: torch.Tensor,
-        tokens: list[int],
-        blank_id: int = 0,
-        beam_width: int = 5,
-    ) -> list[Point] | None:
-        """Beam search backtracking through trellis.
-        Maintains multiple hypotheses during decoding, pruning to top candidates
-        by cumulative score at each step.
-        Args:
-            trellis: Trellis matrix of shape (num_frames + 1, num_tokens + 1)
-            emission: Log-softmax emission matrix of shape (num_frames, num_classes)
-            tokens: List of target token indices
-            blank_id: Index of the blank/CTC token (default 0)
-            beam_width: Number of top paths to keep during beam search (default 5)
-        Returns:
-            List of Point objects representing the best alignment path, or None if failed.
-        """
-        num_frames = trellis.size(0) - 1
-        num_tokens = trellis.size(1) - 1
-        if num_tokens == 0:
-            return None
-        # Check if alignment is possible
-        if math.isinf(trellis[num_frames, num_tokens].item()):
-            return None
-        # Initialize beam with final state
-        init_state = BeamState(
-            token_index=num_tokens,
-            time_index=num_frames,
-            score=trellis[num_frames, num_tokens].item(),
-            path=[Point(num_tokens, num_frames, emission[num_frames - 1, blank_id].exp().item())],
-        )
-        beams = [init_state]
-        # Beam search backtracking
-        while beams and beams[0].token_index > 0:
-            next_beams = []
-            for beam in beams:
-                t, j = beam.time_index, beam.token_index
-                if t <= 0:
-                    continue
-                stay_score = trellis[t - 1, j].item()
-                change_score = trellis[t - 1, j - 1].item() if j > 0 else float("-inf")
-                # Stay transition (emit blank)
-                if not math.isinf(stay_score):
-                    prob = emission[t - 1, blank_id].exp().item()
-                    new_path = beam.path.copy()
-                    new_path.append(Point(j, t - 1, prob))
-                    next_beams.append(
-                        BeamState(
-                            token_index=j,
-                            time_index=t - 1,
-                            score=stay_score,
-                            path=new_path,
-                        )
-                    )
-                # Change transition (emit token)
-                if j > 0 and not math.isinf(change_score):
-                    prob = emission[t - 1, tokens[j - 1]].exp().item()
-                    new_path = beam.path.copy()
-                    new_path.append(Point(j - 1, t - 1, prob))
-                    next_beams.append(
-                        BeamState(
-                            token_index=j - 1,
-                            time_index=t - 1,
-                            score=change_score,
-                            path=new_path,
-                        )
-                    )
-            # Prune to top beam_width candidates
-            beams = sorted(next_beams, key=lambda x: x.score, reverse=True)[:beam_width]
-            if not beams:
-                break
-        if not beams:
-            return None
-        # Complete path to beginning
-        best_beam = beams[0]
-        t = best_beam.time_index
-        j = best_beam.token_index
-        while t > 0:
-            prob = emission[t - 1, blank_id].exp().item()
-            best_beam.path.append(Point(j, t - 1, prob))
-            t -= 1
-        return best_beam.path[::-1]
-    @staticmethod
-    def _path_to_spans(
-        path: list[Point] | None, tokens: list[int], num_frames: int
     ) -> list[tuple[int, float, float]]:
-        """Convert beam search path to token spans.
-        Args:
-            path: List of Point objects from beam search, or None
-            tokens: List of target token indices
-            num_frames: Total number of frames
-        Returns:
-            List of (token_id, start_frame, end_frame) for each token.
         """
         num_tokens = len(tokens)
-        if path is None or num_tokens == 0:
-            # Fall back to uniform distribution
-            if num_tokens == 0:
-                return []
             frames_per_token = num_frames / num_tokens
             return [
                 (tokens[i], i * frames_per_token, (i + 1) * frames_per_token)
                 for i in range(num_tokens)
             ]
-        # Group frames by token index
         token_frames: list[list[int]] = [[] for _ in range(num_tokens)]
-        for point in path:
-            # Token index in path is 1-indexed (0 = before first token)
-            if 0 < point.token_index <= num_tokens:
-                token_frames[point.token_index - 1].append(point.time_index)
         # Convert to spans
         token_spans: list[tuple[int, float, float]] = []
-        for token_idx in range(num_tokens):
-            frames = token_frames[token_idx]
             if not frames:
-                # Token never emitted - assign span after previous
                 if token_spans:
                     prev_end = token_spans[-1][2]
-                    start_frame = prev_end
                 else:
-                    start_frame = 0.0
-                token_spans.append((tokens[token_idx], start_frame, start_frame + 1.0))
-            else:
-                start_frame = float(min(frames))
-                end_frame = float(max(frames)) + 1.0
-                token_spans.append((tokens[token_idx], start_frame, end_frame))
         return token_spans
@@ -263,20 +156,22 @@ class ForcedAligner:
     @classmethod
     def align(
         cls,
-        audio: np.ndarray | torch.Tensor,
         text: str,
         sample_rate: int = 16000,
-        beam_width: int = 5,
     ) -> list[dict]:
         """Align transcript to audio and return word-level timestamps.
-        Uses CTC trellis with beam search backtracking for optimal forced alignment.
         Args:
-            audio: Audio waveform as numpy array or torch tensor
             text: Transcript text to align
             sample_rate: Audio sample rate (default 16000)
-            beam_width: Number of paths to keep during beam search (default 5)
         Returns:
             List of dicts with 'word', 'start', 'end' keys
@@ -284,7 +179,7 @@ class ForcedAligner:
         import torchaudio
         device = _get_device()
-        model, _, dictionary = cls.get_instance(device)
         assert cls._bundle is not None and dictionary is not None  # Initialized by get_instance
         # Convert audio to tensor (copy to ensure array is writable)
@@ -326,10 +221,9 @@ class ForcedAligner:
         if not tokens:
             return []
-        # Build CTC trellis and use beam search backtracking
         trellis = cls._get_trellis(emission, tokens, blank_id=0)
-        path = cls._backtrack_beam(trellis, emission, tokens, blank_id=0, beam_width=beam_width)
-        alignment_path = cls._path_to_spans(path, tokens, emission.size(0))
         # Convert frame indices to time (model stride is 320 samples at 16kHz = 20ms)
         frame_duration = 320 / cls._bundle.sample_rate

 """Forced alignment for word-level timestamps using Wav2Vec2."""
 import numpy as np
 import torch
 def _get_device() -> str:
     """Get best available device for non-transformers models."""
     if torch.cuda.is_available():
 class ForcedAligner:
     """Lazy-loaded forced aligner for word-level timestamps using torchaudio wav2vec2.
+    Uses Viterbi trellis algorithm for optimal alignment path finding.
     """
     _bundle = None
         return trellis
     @staticmethod
     def _backtrack(
         trellis: torch.Tensor, emission: torch.Tensor, tokens: list[int], blank_id: int = 0
     ) -> list[tuple[int, float, float]]:
         """Backtrack through the trellis to recover a monotonic forced alignment.

         Walks from the final cell ``(num_frames, num_tokens)`` back towards the
         origin, consuming one frame per step. At every step the "stay"
         predecessor (same token column, blank emitted) is compared with the
         "move" predecessor (previous token column, token emitted); on a move the
         frame is recorded as that token's emission frame. Each token therefore
         receives exactly one frame, and frames are non-decreasing in token order.

         Args:
             trellis: Score matrix indexed as ``trellis[frame, token]``. It is read
                 at rows ``0..num_frames`` and columns ``0..len(tokens)``.
             emission: Per-frame class scores indexed as ``emission[frame, class]``;
                 its first dimension defines ``num_frames``.
             tokens: Target token indices in transcript order.
             blank_id: Index of the blank class in ``emission`` (default 0).

         Returns:
             One ``(token_id, start_frame, end_frame)`` tuple per token. Aligned
             tokens span a single frame (``end_frame == start_frame + 1``). When
             the final trellis cell is not finite (``-inf`` or NaN), no usable
             path exists and the frames are split evenly across the tokens.
         """
         num_frames = emission.size(0)
         num_tokens = len(tokens)
         if num_tokens == 0:
             return []

         # isfinite (rather than an equality test against -inf) also rejects NaN,
         # which would otherwise make every comparison below False and collapse
         # all tokens onto frame 0.
         if not torch.isfinite(trellis[num_frames, num_tokens]):
             frames_per_token = num_frames / num_tokens
             return [
                 (token_id, i * frames_per_token, (i + 1) * frames_per_token)
                 for i, token_id in enumerate(tokens)
             ]

         # Branching on a device tensor forces a host sync on every step, so move
         # both matrices to host memory once before the Python loop.
         trellis = trellis.detach().cpu()
         emission = emission.detach().cpu()

         # emit_frame[k] is the frame at which token k is emitted. The default of
         # 0 covers the defensive case where frames run out before tokens do.
         emit_frame = [0] * num_tokens
         t, j = num_frames, num_tokens
         while t > 0 and j > 0:
             stay_score = trellis[t - 1, j] + emission[t - 1, blank_id]
             move_score = trellis[t - 1, j - 1] + emission[t - 1, tokens[j - 1]]
             if move_score >= stay_score:
                 # Token j-1 was emitted at frame t-1; ties favour the move.
                 emit_frame[j - 1] = t - 1
                 j -= 1
             # Time always advances backwards, which keeps the path monotonic.
             t -= 1

         return [
             (token_id, float(frame), float(frame) + 1.0)
             for token_id, frame in zip(tokens, emit_frame)
         ]
     @classmethod
     def align(
         cls,
+        audio: np.ndarray,
         text: str,
         sample_rate: int = 16000,
+        _language: str = "eng",
+        _batch_size: int = 16,
     ) -> list[dict]:
         """Align transcript to audio and return word-level timestamps.
+        Uses Viterbi trellis algorithm for optimal forced alignment.
         Args:
+            audio: Audio waveform as numpy array
             text: Transcript text to align
             sample_rate: Audio sample rate (default 16000)
+            _language: ISO-639-3 language code (default "eng" for English, unused)
+            _batch_size: Batch size for alignment model (unused)
         Returns:
             List of dicts with 'word', 'start', 'end' keys
         import torchaudio
         device = _get_device()
+        model, _labels, dictionary = cls.get_instance(device)
         assert cls._bundle is not None and dictionary is not None  # Initialized by get_instance
         # Convert audio to tensor (copy to ensure array is writable)
         if not tokens:
             return []
+        # Build Viterbi trellis and backtrack for optimal path
         trellis = cls._get_trellis(emission, tokens, blank_id=0)
+        alignment_path = cls._backtrack(trellis, emission, tokens, blank_id=0)
         # Convert frame indices to time (model stride is 320 samples at 16kHz = 20ms)
         frame_duration = 320 / cls._bundle.sample_rate