mazesmazes
/

tiny-audio

@@ -1,14 +1,8 @@
 """Forced alignment for word-level timestamps using Wav2Vec2."""
-import math
-from dataclasses import dataclass
 import numpy as np
 import torch
-# Beam search width for backtracking (from WhisperX)
-BEAM_WIDTH = 2
 # Offset compensation for Wav2Vec2-BASE systematic bias (in seconds)
 # Calibrated on librispeech-alignments dataset
 START_OFFSET = 0.06  # Subtract from start times (shift earlier)
@@ -24,25 +18,6 @@ def _get_device() -> str:
     return "cpu"
-@dataclass
-class Point:
-    """A point in the alignment path."""
-    token_index: int
-    time_index: int
-    score: float
-@dataclass
-class BeamState:
-    """State in beam search backtracking."""
-    token_index: int
-    time_index: int
-    score: float
-    path: list[Point]
 class ForcedAligner:
     """Lazy-loaded forced aligner for word-level timestamps using torchaudio wav2vec2.
@@ -113,6 +88,10 @@ class ForcedAligner:
     ) -> list[tuple[int, float, float]]:
         """Backtrack through trellis to find optimal forced monotonic alignment.
         Guarantees:
         - All tokens are emitted exactly once
         - Strictly monotonic: each token's frames come after previous token's
@@ -137,8 +116,8 @@ class ForcedAligner:
             ]
         # Backtrack: find where each token transition occurred
-        # path[i] = frame where token i was first emitted
-        token_frames: list[list[int]] = [[] for _ in range(num_tokens)]
         t = num_frames
         j = num_tokens
@@ -150,172 +129,48 @@ class ForcedAligner:
             if move_score >= stay_score:
                 # Token j-1 was emitted at frame t-1
-                token_frames[j - 1].insert(0, t - 1)
                 j -= 1
             # Always decrement time (monotonic)
             t -= 1
         # Handle any remaining tokens at the start (edge case)
         while j > 0:
-            token_frames[j - 1].insert(0, 0)
             j -= 1
-        # Convert to spans
         token_spans: list[tuple[int, float, float]] = []
-        for token_idx, frames in enumerate(token_frames):
-            if not frames:
                 # Token never emitted - assign minimal span after previous
                 if token_spans:
                     prev_end = token_spans[-1][2]
-                    frames = [int(prev_end)]
                 else:
-                    frames = [0]
             token_id = tokens[token_idx]
-            start_frame = float(min(frames))
-            end_frame = float(max(frames)) + 1.0
-            token_spans.append((token_id, start_frame, end_frame))
-        return token_spans
-    @staticmethod
-    def _backtrack_beam(
-        trellis: torch.Tensor,
-        emission: torch.Tensor,
-        tokens: list[int],
-        blank_id: int = 0,
-        beam_width: int = BEAM_WIDTH,
-    ) -> list[Point] | None:
-        """Beam search backtracking for better alignment paths.
-        Explores multiple candidate paths simultaneously, keeping the top beam_width
-        paths at each step. This can find better alignments than greedy backtracking.
-        Based on WhisperX implementation.
-        Args:
-            trellis: Trellis matrix from forward pass
-            emission: Log-softmax emission matrix
-            tokens: List of target token indices
-            blank_id: Index of the blank/CTC token
-            beam_width: Number of candidate paths to keep
-        Returns:
-            List of Points representing the best alignment path, or None if failed
-        """
-        T, J = trellis.size(0) - 1, trellis.size(1) - 1
-        if J == 0:
-            return None
-        init_state = BeamState(
-            token_index=J,
-            time_index=T,
-            score=trellis[T, J].item(),
-            path=[Point(J, T, emission[T, blank_id].exp().item())],
-        )
-        beams = [init_state]
-        while beams and beams[0].token_index > 0:
-            next_beams = []
-            for beam in beams:
-                t, j = beam.time_index, beam.token_index
-                if t <= 0:
-                    continue
-                p_stay = emission[t - 1, blank_id]
-                p_change = emission[t - 1, tokens[j - 1]] if j > 0 else float("-inf")
-                stay_score = trellis[t - 1, j].item()
-                change_score = trellis[t - 1, j - 1].item() if j > 0 else float("-inf")
-                # Stay option
-                if not math.isinf(stay_score):
-                    new_path = beam.path.copy()
-                    new_path.append(Point(j, t - 1, p_stay.exp().item()))
-                    next_beams.append(
-                        BeamState(
-                            token_index=j,
-                            time_index=t - 1,
-                            score=stay_score,
-                            path=new_path,
-                        )
-                    )
-                # Change option
-                if j > 0 and not math.isinf(change_score):
-                    new_path = beam.path.copy()
-                    new_path.append(Point(j - 1, t - 1, p_change.exp().item()))
-                    next_beams.append(
-                        BeamState(
-                            token_index=j - 1,
-                            time_index=t - 1,
-                            score=change_score,
-                            path=new_path,
-                        )
-                    )
-            # Keep top beam_width paths by score
-            beams = sorted(next_beams, key=lambda x: x.score, reverse=True)[:beam_width]
-            if not beams:
-                break
-        if not beams:
-            return None
-        # Fill remaining time steps with blank emissions
-        best_beam = beams[0]
-        t = best_beam.time_index
-        j = best_beam.token_index
-        while t > 0:
-            prob = emission[t - 1, blank_id].exp().item()
-            best_beam.path.append(Point(j, t - 1, prob))
-            t -= 1
-        return best_beam.path[::-1]
-    @staticmethod
-    def _path_to_spans(
-        path: list[Point], tokens: list[int]
-    ) -> list[tuple[int, float, float]]:
-        """Convert a beam search path to token spans.
-        Args:
-            path: List of Points from beam search
-            tokens: List of target token indices
-        Returns:
-            List of (token_id, start_frame, end_frame) tuples
-        """
-        if not path or not tokens:
-            return []
-        num_tokens = len(tokens)
-        token_frames: list[list[int]] = [[] for _ in range(num_tokens)]
-        # Group frames by token index
-        for point in path:
-            if 0 < point.token_index <= num_tokens:
-                token_frames[point.token_index - 1].append(point.time_index)
-        # Convert to spans
-        token_spans: list[tuple[int, float, float]] = []
-        for token_idx, frames in enumerate(token_frames):
-            if not frames:
-                # Token never emitted - assign minimal span after previous
-                if token_spans:
-                    prev_end = token_spans[-1][2]
-                    frames = [int(prev_end)]
-                else:
-                    frames = [0]
-            token_id = tokens[token_idx]
-            start_frame = float(min(frames))
-            end_frame = float(max(frames)) + 1.0
             token_spans.append((token_id, start_frame, end_frame))
         return token_spans
@@ -390,13 +245,7 @@ class ForcedAligner:
         # Build Viterbi trellis and backtrack for optimal path
         trellis = cls._get_trellis(emission, tokens, blank_id=0)
-        # Try beam search first, fall back to greedy if it fails
-        beam_path = cls._backtrack_beam(trellis, emission, tokens, blank_id=0)
-        if beam_path is not None:
-            alignment_path = cls._path_to_spans(beam_path, tokens)
-        else:
-            alignment_path = cls._backtrack(trellis, emission, tokens, blank_id=0)
         # Convert frame indices to time (model stride is 320 samples at 16kHz = 20ms)
         frame_duration = 320 / cls._bundle.sample_rate

 """Forced alignment for word-level timestamps using Wav2Vec2."""
 import numpy as np
 import torch
 # Offset compensation for Wav2Vec2-BASE systematic bias (in seconds)
 # Calibrated on librispeech-alignments dataset
 START_OFFSET = 0.06  # Subtract from start times (shift earlier)
     return "cpu"
 class ForcedAligner:
     """Lazy-loaded forced aligner for word-level timestamps using torchaudio wav2vec2.
     ) -> list[tuple[int, float, float]]:
         """Backtrack through trellis to find optimal forced monotonic alignment.
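+        Uses emission probability weighting for sub-frame precision. Since wav2vec2
+        has 20ms frame resolution, weighting by emission scores can improve accuracy
+        by estimating where within a frame the token boundary likely falls.
+        Weighting is applied only when a token has more than one recorded frame and
+        a positive total score; otherwise the plain min/max frame span is used.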
         Guarantees:
         - All tokens are emitted exactly once
         - Strictly monotonic: each token's frames come after previous token's
             ]
         # Backtrack: find where each token transition occurred
+        # token_frames[i] = list of (frame, emission probability) tuples where token i was emitted
+        token_frames: list[list[tuple[int, float]]] = [[] for _ in range(num_tokens)]
         t = num_frames
         j = num_tokens
             if move_score >= stay_score:
                 # Token j-1 was emitted at frame t-1
+                # Store frame index and emission probability for weighting
+                prob = emission[t - 1, tokens[j - 1]].exp().item()
+                token_frames[j - 1].insert(0, (t - 1, prob))
                 j -= 1
             # Always decrement time (monotonic)
             t -= 1
         # Handle any remaining tokens at the start (edge case)
         while j > 0:
+            token_frames[j - 1].insert(0, (0, 0.0))
             j -= 1
+        # Convert to spans with emission-weighted sub-frame precision
         token_spans: list[tuple[int, float, float]] = []
+        for token_idx, frames_with_scores in enumerate(token_frames):
+            if not frames_with_scores:
                 # Token never emitted - assign minimal span after previous
                 if token_spans:
                     prev_end = token_spans[-1][2]
+                    frames_with_scores = [(int(prev_end), 0.0)]
                 else:
+                    frames_with_scores = [(0, 0.0)]
             token_id = tokens[token_idx]
+            frames = [f for f, _ in frames_with_scores]
+            scores = [s for _, s in frames_with_scores]
+            # Compute an emission-weighted center for the span (sub-frame precision);
+            # start and end are then placed symmetrically around that center.
+            # The weighting shifts the center toward frames with higher emission probability
+            total_score = sum(scores)
+            if total_score > 0 and len(frames) > 1:
+                # Weighted centroid gives sub-frame precision
+                weighted_center = sum(f * s for f, s in zip(frames, scores)) / total_score
+                # Estimate start/end based on weighted center and span width
+                span_width = max(frames) - min(frames) + 1
+                start_frame = weighted_center - span_width / 2
+                end_frame = weighted_center + span_width / 2
+            else:
+                # Fall back to simple min/max
+                start_frame = float(min(frames))
+                end_frame = float(max(frames)) + 1.0
             token_spans.append((token_id, start_frame, end_frame))
         return token_spans
         # Build Viterbi trellis and backtrack for optimal path
         trellis = cls._get_trellis(emission, tokens, blank_id=0)
+        alignment_path = cls._backtrack(trellis, emission, tokens, blank_id=0)
         # Convert frame indices to time (model stride is 320 samples at 16kHz = 20ms)
         frame_duration = 320 / cls._bundle.sample_rate