Update custom model files, README, and requirements
asr_pipeline.py  +1 -222

@@ -30,12 +30,6 @@ class ForcedAligner:
     _model = None
     _labels = None
     _dictionary = None
-    _vad_model = None
-
-    # VAD parameters
-    VAD_HOP_SIZE = 256  # TEN-VAD frame size (16ms at 16kHz)
-    VAD_THRESHOLD = 0.5  # Speech detection threshold
-    VAD_MAX_GAP = 0.15  # Max gap to merge speech segments (seconds)
 
     @classmethod
     def get_instance(cls, device: str = "cuda"):
@@ -57,135 +51,6 @@
         cls._dictionary = {c: i for i, c in enumerate(cls._labels)}
         return cls._model, cls._labels, cls._dictionary
 
-    @classmethod
-    def _get_vad_model(cls):
-        """Lazy-load TEN-VAD model (singleton)."""
-        if cls._vad_model is None:
-            from ten_vad import TenVad
-
-            cls._vad_model = TenVad(hop_size=cls.VAD_HOP_SIZE, threshold=cls.VAD_THRESHOLD)
-        return cls._vad_model
-
-    @classmethod
-    def _get_speech_regions(
-        cls, audio: np.ndarray, sample_rate: int = 16000
-    ) -> list[tuple[float, float]]:
-        """Get speech regions using TEN-VAD.
-
-        Args:
-            audio: Audio waveform as numpy array
-            sample_rate: Audio sample rate
-
-        Returns:
-            List of (start_time, end_time) tuples for speech regions
-        """
-        vad_model = cls._get_vad_model()
-
-        # Convert to int16 as required by TEN-VAD
-        if audio.dtype != np.int16:
-            audio_int16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
-        else:
-            audio_int16 = audio
-
-        # Process frame by frame
-        hop_size = cls.VAD_HOP_SIZE
-        frame_duration = hop_size / sample_rate
-        speech_frames: list[bool] = []
-
-        for i in range(0, len(audio_int16) - hop_size, hop_size):
-            frame = audio_int16[i : i + hop_size]
-            _, is_speech = vad_model.process(frame)
-            speech_frames.append(is_speech)
-
-        # Convert frame-level decisions to segments
-        segments: list[tuple[float, float]] = []
-        in_speech = False
-        start_idx = 0
-
-        for i, is_speech in enumerate(speech_frames):
-            if is_speech and not in_speech:
-                start_idx = i
-                in_speech = True
-            elif not is_speech and in_speech:
-                start_time = start_idx * frame_duration
-                end_time = i * frame_duration
-                segments.append((start_time, end_time))
-                in_speech = False
-
-        # Handle trailing speech
-        if in_speech:
-            start_time = start_idx * frame_duration
-            end_time = len(speech_frames) * frame_duration
-            segments.append((start_time, end_time))
-
-        # Merge segments with small gaps
-        return cls._merge_speech_segments(segments)
-
-    @classmethod
-    def _merge_speech_segments(
-        cls, segments: list[tuple[float, float]]
-    ) -> list[tuple[float, float]]:
-        """Merge speech segments with small gaps."""
-        if not segments:
-            return segments
-
-        merged: list[tuple[float, float]] = [segments[0]]
-        for start, end in segments[1:]:
-            prev_start, prev_end = merged[-1]
-            if start - prev_end <= cls.VAD_MAX_GAP:
-                merged[-1] = (prev_start, end)
-            else:
-                merged.append((start, end))
-        return merged
-
-    @classmethod
-    def _is_in_speech(cls, time: float, speech_regions: list[tuple[float, float]]) -> bool:
-        """Check if a timestamp falls within any speech region."""
-        return any(start <= time <= end for start, end in speech_regions)
-
-    @classmethod
-    def _find_nearest_speech_boundary(
-        cls, time: float, speech_regions: list[tuple[float, float]], direction: str = "any"
-    ) -> float:
-        """Find the nearest speech region boundary to a timestamp.
-
-        Args:
-            time: Timestamp to find boundary for
-            speech_regions: List of (start, end) speech regions
-            direction: "start" for word starts, "end" for word ends, "any" for closest
-
-        Returns:
-            Adjusted timestamp snapped to nearest speech boundary
-        """
-        if not speech_regions:
-            return time
-
-        best_time = time
-        min_dist = float("inf")
-
-        for start, end in speech_regions:
-            # If time is inside this region, return as-is
-            if start <= time <= end:
-                return time
-
-            # Check distance to boundaries
-            if direction in ("start", "any"):
-                dist = abs(time - start)
-                if dist < min_dist:
-                    min_dist = dist
-                    best_time = start
-
-            if direction in ("end", "any"):
-                dist = abs(time - end)
-                if dist < min_dist:
-                    min_dist = dist
-                    best_time = end
-
-        return best_time
-
-    # Confidence threshold for alignment scores (log probability)
-    MIN_CONFIDENCE = -5.0  # Tokens with scores below this are considered low-confidence
-
     @classmethod
     def align(
         cls,
@@ -194,7 +59,6 @@
         sample_rate: int = 16000,
         _language: str = "eng",
         _batch_size: int = 16,
-        use_vad: bool = True,
     ) -> list[dict]:
         """Align transcript to audio and return word-level timestamps.
 
@@ -204,10 +68,9 @@
             sample_rate: Audio sample rate (default 16000)
             _language: ISO-639-3 language code (default "eng" for English, unused)
             _batch_size: Batch size for alignment model (unused)
-            use_vad: If True, use VAD to refine word boundaries (default True)
 
         Returns:
-            List of dicts with 'word', 'start', 'end'
+            List of dicts with 'word', 'start', 'end' keys
         """
         import torchaudio
         from torchaudio.functional import forced_align, merge_tokens
@@ -215,11 +78,6 @@
         device = _get_device()
         model, labels, dictionary = cls.get_instance(device)
 
-        # Step 1: Get speech regions using VAD (before any processing)
-        speech_regions = []
-        if use_vad:
-            speech_regions = cls._get_speech_regions(audio, sample_rate)
-
         # Convert audio to tensor (copy to ensure array is writable)
         if isinstance(audio, np.ndarray):
             waveform = torch.from_numpy(audio.copy()).float()
@@ -272,122 +130,43 @@
         frame_duration = 320 / cls._bundle.sample_rate
 
         # Group token spans into words based on pipe separator
-        # Track confidence scores per word
         words = text.split()
         word_timestamps = []
        current_word_start = None
        current_word_end = None
-        current_word_scores: list[float] = []
        word_idx = 0
 
        for span in token_spans:
            token_char = labels[span.token]
            if token_char == "|":  # Word separator
                if current_word_start is not None and word_idx < len(words):
-                    # Calculate word confidence as mean of token scores
-                    confidence = (
-                        sum(current_word_scores) / len(current_word_scores)
-                        if current_word_scores
-                        else 0.0
-                    )
                    word_timestamps.append(
                        {
                            "word": words[word_idx],
                            "start": current_word_start * frame_duration,
                            "end": current_word_end * frame_duration,
-                            "confidence": confidence,
                        }
                    )
                    word_idx += 1
                    current_word_start = None
                    current_word_end = None
-                    current_word_scores = []
            else:
                if current_word_start is None:
                    current_word_start = span.start
                current_word_end = span.end
-                current_word_scores.append(span.score)
 
        # Don't forget the last word
        if current_word_start is not None and word_idx < len(words):
-            confidence = (
-                sum(current_word_scores) / len(current_word_scores) if current_word_scores else 0.0
-            )
            word_timestamps.append(
                {
                    "word": words[word_idx],
                    "start": current_word_start * frame_duration,
                    "end": current_word_end * frame_duration,
-                    "confidence": confidence,
                }
            )
 
-        # Step 2: Refine timestamps using VAD
-        if use_vad and speech_regions:
-            word_timestamps = cls._refine_with_vad(word_timestamps, speech_regions)
-
        return word_timestamps
 
-    @classmethod
-    def _refine_with_vad(
-        cls, word_timestamps: list[dict], speech_regions: list[tuple[float, float]]
-    ) -> list[dict]:
-        """Refine word timestamps using VAD speech regions.
-
-        - Words with low confidence that fall outside speech regions are flagged
-        - Word boundaries are snapped to speech region boundaries when close
-
-        Args:
-            word_timestamps: List of word dicts with 'start', 'end', 'confidence'
-            speech_regions: List of (start, end) speech regions
-
-        Returns:
-            Refined word timestamps
-        """
-        if not word_timestamps or not speech_regions:
-            return word_timestamps
-
-        refined = []
-        for word in word_timestamps:
-            start = word["start"]
-            end = word["end"]
-            confidence = word.get("confidence", 0.0)
-
-            # Check if word midpoint is in a speech region
-            midpoint = (start + end) / 2
-            in_speech = cls._is_in_speech(midpoint, speech_regions)
-
-            # For low-confidence words outside speech, snap to nearest speech boundary
-            if not in_speech and confidence < cls.MIN_CONFIDENCE:
-                # Find the nearest speech region and snap boundaries
-                start = cls._find_nearest_speech_boundary(start, speech_regions, "start")
-                end = cls._find_nearest_speech_boundary(end, speech_regions, "end")
-                # Ensure start < end
-                if start >= end:
-                    end = start + 0.01
-
-            # For words near speech boundaries, snap to the boundary
-            # This helps align word edges with actual speech onset/offset
-            snap_threshold = 0.05  # 50ms
-            for region_start, region_end in speech_regions:
-                # Snap start to speech region start if close
-                if abs(start - region_start) < snap_threshold:
-                    start = region_start
-                # Snap end to speech region end if close
-                if abs(end - region_end) < snap_threshold:
-                    end = region_end
-
-            refined.append(
-                {
-                    "word": word["word"],
-                    "start": start,
-                    "end": end,
-                    "confidence": confidence,
-                }
-            )
-
-        return refined
-
 
 try:
     from .diarization import SpeakerDiarizer
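
For context, a minimal usage sketch of the simplified align API after this change. The parameters ahead of sample_rate are not visible in the hunks above, so the audio and text arguments, the module path, and the placeholder waveform are assumptions:

# Hypothetical usage sketch, not part of the commit. Assumes align() takes the
# audio array and transcript text before sample_rate (not shown in the hunks)
# and that the class is importable from asr_pipeline.
import numpy as np

from asr_pipeline import ForcedAligner

sample_rate = 16000
audio = np.zeros(sample_rate * 2, dtype=np.float32)  # placeholder waveform

words = ForcedAligner.align(audio, "hello world", sample_rate=sample_rate)
for w in words:
    # Each dict now carries only 'word', 'start', 'end'; the 'confidence'
    # key was removed along with the VAD refinement.
    print(f"{w['word']}: {w['start']:.2f}s - {w['end']:.2f}s")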
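The frame-to-seconds conversion the commit keeps (frame_duration = 320 / cls._bundle.sample_rate) is easy to sanity-check by hand; a small sketch assuming the bundle's 16 kHz sample rate, with hypothetical span indices:

# Sketch of the span-to-seconds arithmetic retained in align(). The 16 kHz
# bundle rate is an assumption; the span frame indices are hypothetical.
bundle_sample_rate = 16000
frame_duration = 320 / bundle_sample_rate  # 320 samples per emission frame = 0.02 s

span_start, span_end = 100, 110  # token span in frames
print(span_start * frame_duration, span_end * frame_duration)  # 2.0 2.2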
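Callers that relied on use_vad=True lose the boundary refinement deleted above. The segmentation core of that logic (frame-level speech flags to merged regions) can still be reproduced outside the class; a standalone sketch, with the function name and test flags purely illustrative:

# Standalone sketch of the segment-merge logic this commit removes: frame-level
# speech flags become (start, end) regions, and regions separated by gaps of at
# most max_gap seconds are merged, mirroring the deleted VAD_MAX_GAP behavior.
def flags_to_regions(flags: list[bool], frame_duration: float, max_gap: float = 0.15):
    regions: list[tuple[float, float]] = []
    start = None
    for i, is_speech in enumerate(flags):
        if is_speech and start is None:
            start = i
        elif not is_speech and start is not None:
            regions.append((start * frame_duration, i * frame_duration))
            start = None
    if start is not None:  # trailing speech
        regions.append((start * frame_duration, len(flags) * frame_duration))

    merged = regions[:1]
    for s, e in regions[1:]:
        if s - merged[-1][1] <= max_gap:
            merged[-1] = (merged[-1][0], e)
        else:
            merged.append((s, e))
    return merged

# 256-sample hops at 16 kHz -> 16 ms frames, as in the removed VAD_HOP_SIZE.
print(flags_to_regions([True, True, False, True], 256 / 16000))  # [(0.0, 0.064)]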