Update custom model files, README, and requirements
Browse files — alignment.py: +12 additions, −3 deletions
alignment.py
CHANGED
|
@@ -197,7 +197,8 @@ class ForcedAligner:
|
|
| 197 |
import torchaudio
|
| 198 |
|
| 199 |
device = _get_device()
|
| 200 |
-
model,
|
|
|
|
| 201 |
|
| 202 |
# Convert audio to tensor (copy to ensure array is writable)
|
| 203 |
if isinstance(audio, np.ndarray):
|
|
@@ -259,7 +260,11 @@ class ForcedAligner:
|
|
| 259 |
|
| 260 |
for token_id, start_frame, end_frame in alignment_path:
|
| 261 |
if token_id == separator_id: # Word separator
|
| 262 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
start_time = max(0.0, current_word_start * frame_duration - start_offset)
|
| 264 |
end_time = max(0.0, current_word_end * frame_duration - end_offset)
|
| 265 |
word_timestamps.append(
|
|
@@ -278,7 +283,11 @@ class ForcedAligner:
|
|
| 278 |
current_word_end = end_frame
|
| 279 |
|
| 280 |
# Don't forget the last word
|
| 281 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
start_time = max(0.0, current_word_start * frame_duration - start_offset)
|
| 283 |
end_time = max(0.0, current_word_end * frame_duration - end_offset)
|
| 284 |
word_timestamps.append(
|
|
|
|
| 197 |
import torchaudio
|
| 198 |
|
| 199 |
device = _get_device()
|
| 200 |
+
model, _labels, dictionary = cls.get_instance(device)
|
| 201 |
+
assert cls._bundle is not None and dictionary is not None # Initialized by get_instance
|
| 202 |
|
| 203 |
# Convert audio to tensor (copy to ensure array is writable)
|
| 204 |
if isinstance(audio, np.ndarray):
|
|
|
|
| 260 |
|
| 261 |
for token_id, start_frame, end_frame in alignment_path:
|
| 262 |
if token_id == separator_id: # Word separator
|
| 263 |
+
if (
|
| 264 |
+
current_word_start is not None
|
| 265 |
+
and current_word_end is not None
|
| 266 |
+
and word_idx < len(words)
|
| 267 |
+
):
|
| 268 |
start_time = max(0.0, current_word_start * frame_duration - start_offset)
|
| 269 |
end_time = max(0.0, current_word_end * frame_duration - end_offset)
|
| 270 |
word_timestamps.append(
|
|
|
|
| 283 |
current_word_end = end_frame
|
| 284 |
|
| 285 |
# Don't forget the last word
|
| 286 |
+
if (
|
| 287 |
+
current_word_start is not None
|
| 288 |
+
and current_word_end is not None
|
| 289 |
+
and word_idx < len(words)
|
| 290 |
+
):
|
| 291 |
start_time = max(0.0, current_word_start * frame_duration - start_offset)
|
| 292 |
end_time = max(0.0, current_word_end * frame_duration - end_offset)
|
| 293 |
word_timestamps.append(
|