mazesmazes
/

tiny-audio

@@ -70,6 +70,11 @@ class ForcedAligner:
         trellis = torch.full((num_frames + 1, num_tokens + 1), -float("inf"))
         trellis[0, 0] = 0
         for t in range(num_frames):
             for j in range(num_tokens + 1):
                 # Stay: emit blank and stay at j tokens
@@ -85,19 +90,16 @@ class ForcedAligner:
     @staticmethod
     def _backtrack(
         trellis: torch.Tensor, emission: torch.Tensor, tokens: list[int], blank_id: int = 0
-    ) -> list[tuple[int, float, float]]:
         """Backtrack through trellis to find optimal forced monotonic alignment.
-        Uses emission probability weighting for sub-frame precision. Since wav2vec2
-        has 20ms frame resolution, weighting by emission scores can improve accuracy
-        by estimating where within a frame the token boundary likely falls.
         Guarantees:
         - All tokens are emitted exactly once
         - Strictly monotonic: each token's frames come after previous token's
         - No frame skipping or token teleporting
-        Returns list of (token_id, start_frame, end_frame) for each token.
         """
         num_frames = emission.size(0)
         num_tokens = len(tokens)
@@ -111,12 +113,12 @@ class ForcedAligner:
             # Alignment failed - fall back to uniform distribution
             frames_per_token = num_frames / num_tokens
             return [
-                (tokens[i], i * frames_per_token, (i + 1) * frames_per_token)
                 for i in range(num_tokens)
             ]
         # Backtrack: find where each token transition occurred
-        # path[i] = list of (frame, score) tuples where token i was emitted
         token_frames: list[list[tuple[int, float]]] = [[] for _ in range(num_tokens)]
         t = num_frames
@@ -129,9 +131,9 @@ class ForcedAligner:
             if move_score >= stay_score:
                 # Token j-1 was emitted at frame t-1
-                # Store frame index and emission probability for weighting
-                prob = emission[t - 1, tokens[j - 1]].exp().item()
-                token_frames[j - 1].insert(0, (t - 1, prob))
                 j -= 1
             # Always decrement time (monotonic)
             t -= 1
@@ -141,8 +143,8 @@ class ForcedAligner:
             token_frames[j - 1].insert(0, (0, 0.0))
             j -= 1
-        # Convert to spans with emission-weighted sub-frame precision
-        token_spans: list[tuple[int, float, float]] = []
         for token_idx, frames_with_scores in enumerate(token_frames):
             if not frames_with_scores:
                 # Token never emitted - assign minimal span after previous
@@ -154,24 +156,13 @@ class ForcedAligner:
             token_id = tokens[token_idx]
             frames = [f for f, _ in frames_with_scores]
-            scores = [s for _, s in frames_with_scores]
-            # Compute emission-weighted start position for sub-frame precision
-            # Weight shifts the position toward frames with higher emission probability
-            total_score = sum(scores)
-            if total_score > 0 and len(frames) > 1:
-                # Weighted centroid gives sub-frame precision
-                weighted_center = sum(f * s for f, s in zip(frames, scores)) / total_score
-                # Estimate start/end based on weighted center and span width
-                span_width = max(frames) - min(frames) + 1
-                start_frame = weighted_center - span_width / 2
-                end_frame = weighted_center + span_width / 2
-            else:
-                # Fall back to simple min/max
-                start_frame = float(min(frames))
-                end_frame = float(max(frames)) + 1.0
-            token_spans.append((token_id, start_frame, end_frame))
         return token_spans
@@ -255,22 +246,24 @@ class ForcedAligner:
         end_offset = END_OFFSET
         # Group aligned tokens into words based on pipe separator
         words = text.split()
         word_timestamps = []
-        current_word_start = None
-        current_word_end = None
         word_idx = 0
         separator_id = dictionary.get("|", dictionary.get(" ", 0))
-        for token_id, start_frame, end_frame in alignment_path:
             if token_id == separator_id:  # Word separator
                 if (
-                    current_word_start is not None
-                    and current_word_end is not None
                     and word_idx < len(words)
                 ):
-                    start_time = max(0.0, current_word_start * frame_duration - start_offset)
-                    end_time = max(0.0, current_word_end * frame_duration - end_offset)
                     word_timestamps.append(
                         {
                             "word": words[word_idx],
@@ -279,21 +272,21 @@ class ForcedAligner:
                         }
                     )
                     word_idx += 1
-                current_word_start = None
-                current_word_end = None
             else:
-                if current_word_start is None:
-                    current_word_start = start_frame
-                current_word_end = end_frame
         # Don't forget the last word
         if (
-            current_word_start is not None
-            and current_word_end is not None
             and word_idx < len(words)
         ):
-            start_time = max(0.0, current_word_start * frame_duration - start_offset)
-            end_time = max(0.0, current_word_end * frame_duration - end_offset)
             word_timestamps.append(
                 {
                     "word": words[word_idx],

         trellis = torch.full((num_frames + 1, num_tokens + 1), -float("inf"))
         trellis[0, 0] = 0
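+        # Intended to force the alignment to use all tokens: overwrites column 0
+        # (zero tokens emitted) in the last num_tokens - 1 rows of the trellis.
+        # NOTE(review): the value written is +inf, whereas the trellis is
+        # initialised to -inf - confirm the sign is what the recurrence expects.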
+        if num_tokens > 1:
+            trellis[-num_tokens + 1:, 0] = float("inf")
         for t in range(num_frames):
             for j in range(num_tokens + 1):
                 # Stay: emit blank and stay at j tokens
     @staticmethod
     def _backtrack(
         trellis: torch.Tensor, emission: torch.Tensor, tokens: list[int], blank_id: int = 0
+    ) -> list[tuple[int, float, float, float]]:
         """Backtrack through trellis to find optimal forced monotonic alignment.
         Guarantees:
         - All tokens are emitted exactly once
         - Strictly monotonic: each token's frames come after previous token's
         - No frame skipping or token teleporting
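+        Returns list of (token_id, start_frame, end_frame, peak_frame) for each token.
+        The peak_frame is the frame, among those assigned to the token during
+        backtracking, with the highest emission probability for that token.
+        If alignment fails and spans are distributed uniformly, peak_frame is
+        the midpoint of the token's span instead.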
         """
         num_frames = emission.size(0)
         num_tokens = len(tokens)
             # Alignment failed - fall back to uniform distribution
             frames_per_token = num_frames / num_tokens
             return [
+                (tokens[i], i * frames_per_token, (i + 1) * frames_per_token, (i + 0.5) * frames_per_token)
                 for i in range(num_tokens)
             ]
         # Backtrack: find where each token transition occurred
+        # Collect (frame, emission probability) pairs for each token
         token_frames: list[list[tuple[int, float]]] = [[] for _ in range(num_tokens)]
         t = num_frames
             if move_score >= stay_score:
                 # Token j-1 was emitted at frame t-1
+                # Store frame and its emission probability
+                emit_prob = emission[t - 1, tokens[j - 1]].exp().item()
+                token_frames[j - 1].insert(0, (t - 1, emit_prob))
                 j -= 1
             # Always decrement time (monotonic)
             t -= 1
             token_frames[j - 1].insert(0, (0, 0.0))
             j -= 1
+        # Convert to spans with peak frame
+        token_spans: list[tuple[int, float, float, float]] = []
         for token_idx, frames_with_scores in enumerate(token_frames):
             if not frames_with_scores:
                 # Token never emitted - assign minimal span after previous
             token_id = tokens[token_idx]
             frames = [f for f, _ in frames_with_scores]
+            start_frame = float(min(frames))
+            end_frame = float(max(frames)) + 1.0
+            # Find peak frame (highest emission probability)
+            peak_frame, _ = max(frames_with_scores, key=lambda x: x[1])
+            token_spans.append((token_id, start_frame, end_frame, float(peak_frame)))
         return token_spans
         end_offset = END_OFFSET
         # Group aligned tokens into words based on pipe separator
+        # Use peak emission frame for more accurate word boundaries
         words = text.split()
         word_timestamps = []
+        first_char_peak = None
+        last_char_peak = None
         word_idx = 0
         separator_id = dictionary.get("|", dictionary.get(" ", 0))
+        for token_id, start_frame, end_frame, peak_frame in alignment_path:
             if token_id == separator_id:  # Word separator
                 if (
+                    first_char_peak is not None
+                    and last_char_peak is not None
                     and word_idx < len(words)
                 ):
+                    # Use peak frames for word boundaries
+                    start_time = max(0.0, first_char_peak * frame_duration - start_offset)
+                    end_time = max(0.0, (last_char_peak + 1) * frame_duration - end_offset)
                     word_timestamps.append(
                         {
                             "word": words[word_idx],
                         }
                     )
                     word_idx += 1
+                first_char_peak = None
+                last_char_peak = None
             else:
+                if first_char_peak is None:
+                    first_char_peak = peak_frame
+                last_char_peak = peak_frame
         # Don't forget the last word
         if (
+            first_char_peak is not None
+            and last_char_peak is not None
             and word_idx < len(words)
         ):
+            start_time = max(0.0, first_char_peak * frame_duration - start_offset)
+            end_time = max(0.0, (last_char_peak + 1) * frame_duration - end_offset)
             word_timestamps.append(
                 {
                     "word": words[word_idx],