mazesmazes committed on
Commit
08c708e
·
verified ·
1 Parent(s): 8fa5959

Update custom model files, README, and requirements

Browse files
Files changed (2) hide show
  1. asr_pipeline.py +222 -1
  2. diarization.py +80 -164
asr_pipeline.py CHANGED
@@ -30,6 +30,12 @@ class ForcedAligner:
30
  _model = None
31
  _labels = None
32
  _dictionary = None
 
 
 
 
 
 
33
 
34
  @classmethod
35
  def get_instance(cls, device: str = "cuda"):
@@ -51,6 +57,135 @@ class ForcedAligner:
51
  cls._dictionary = {c: i for i, c in enumerate(cls._labels)}
52
  return cls._model, cls._labels, cls._dictionary
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  @classmethod
55
  def align(
56
  cls,
@@ -59,6 +194,7 @@ class ForcedAligner:
59
  sample_rate: int = 16000,
60
  _language: str = "eng",
61
  _batch_size: int = 16,
 
62
  ) -> list[dict]:
63
  """Align transcript to audio and return word-level timestamps.
64
 
@@ -68,9 +204,10 @@ class ForcedAligner:
68
  sample_rate: Audio sample rate (default 16000)
69
  _language: ISO-639-3 language code (default "eng" for English, unused)
70
  _batch_size: Batch size for alignment model (unused)
 
71
 
72
  Returns:
73
- List of dicts with 'word', 'start', 'end' keys
74
  """
75
  import torchaudio
76
  from torchaudio.functional import forced_align, merge_tokens
@@ -78,6 +215,11 @@ class ForcedAligner:
78
  device = _get_device()
79
  model, labels, dictionary = cls.get_instance(device)
80
 
 
 
 
 
 
81
  # Convert audio to tensor (copy to ensure array is writable)
82
  if isinstance(audio, np.ndarray):
83
  waveform = torch.from_numpy(audio.copy()).float()
@@ -130,43 +272,122 @@ class ForcedAligner:
130
  frame_duration = 320 / cls._bundle.sample_rate
131
 
132
  # Group token spans into words based on pipe separator
 
133
  words = text.split()
134
  word_timestamps = []
135
  current_word_start = None
136
  current_word_end = None
 
137
  word_idx = 0
138
 
139
  for span in token_spans:
140
  token_char = labels[span.token]
141
  if token_char == "|": # Word separator
142
  if current_word_start is not None and word_idx < len(words):
 
 
 
 
 
 
143
  word_timestamps.append(
144
  {
145
  "word": words[word_idx],
146
  "start": current_word_start * frame_duration,
147
  "end": current_word_end * frame_duration,
 
148
  }
149
  )
150
  word_idx += 1
151
  current_word_start = None
152
  current_word_end = None
 
153
  else:
154
  if current_word_start is None:
155
  current_word_start = span.start
156
  current_word_end = span.end
 
157
 
158
  # Don't forget the last word
159
  if current_word_start is not None and word_idx < len(words):
 
 
 
160
  word_timestamps.append(
161
  {
162
  "word": words[word_idx],
163
  "start": current_word_start * frame_duration,
164
  "end": current_word_end * frame_duration,
 
165
  }
166
  )
167
 
 
 
 
 
168
  return word_timestamps
169
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
  try:
172
  from .diarization import SpeakerDiarizer
 
30
  _model = None
31
  _labels = None
32
  _dictionary = None
33
+ _vad_model = None
34
+
35
+ # VAD parameters
36
+ VAD_HOP_SIZE = 256 # TEN-VAD frame size (16ms at 16kHz)
37
+ VAD_THRESHOLD = 0.5 # Speech detection threshold
38
+ VAD_MAX_GAP = 0.15 # Max gap to merge speech segments (seconds)
39
 
40
  @classmethod
41
  def get_instance(cls, device: str = "cuda"):
 
57
  cls._dictionary = {c: i for i, c in enumerate(cls._labels)}
58
  return cls._model, cls._labels, cls._dictionary
59
 
60
+ @classmethod
61
+ def _get_vad_model(cls):
62
+ """Lazy-load TEN-VAD model (singleton)."""
63
+ if cls._vad_model is None:
64
+ from ten_vad import TenVad
65
+
66
+ cls._vad_model = TenVad(hop_size=cls.VAD_HOP_SIZE, threshold=cls.VAD_THRESHOLD)
67
+ return cls._vad_model
68
+
69
+ @classmethod
70
+ def _get_speech_regions(
71
+ cls, audio: np.ndarray, sample_rate: int = 16000
72
+ ) -> list[tuple[float, float]]:
73
+ """Get speech regions using TEN-VAD.
74
+
75
+ Args:
76
+ audio: Audio waveform as numpy array
77
+ sample_rate: Audio sample rate
78
+
79
+ Returns:
80
+ List of (start_time, end_time) tuples for speech regions
81
+ """
82
+ vad_model = cls._get_vad_model()
83
+
84
+ # Convert to int16 as required by TEN-VAD
85
+ if audio.dtype != np.int16:
86
+ audio_int16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
87
+ else:
88
+ audio_int16 = audio
89
+
90
+ # Process frame by frame
91
+ hop_size = cls.VAD_HOP_SIZE
92
+ frame_duration = hop_size / sample_rate
93
+ speech_frames: list[bool] = []
94
+
95
+ for i in range(0, len(audio_int16) - hop_size, hop_size):
96
+ frame = audio_int16[i : i + hop_size]
97
+ _, is_speech = vad_model.process(frame)
98
+ speech_frames.append(is_speech)
99
+
100
+ # Convert frame-level decisions to segments
101
+ segments: list[tuple[float, float]] = []
102
+ in_speech = False
103
+ start_idx = 0
104
+
105
+ for i, is_speech in enumerate(speech_frames):
106
+ if is_speech and not in_speech:
107
+ start_idx = i
108
+ in_speech = True
109
+ elif not is_speech and in_speech:
110
+ start_time = start_idx * frame_duration
111
+ end_time = i * frame_duration
112
+ segments.append((start_time, end_time))
113
+ in_speech = False
114
+
115
+ # Handle trailing speech
116
+ if in_speech:
117
+ start_time = start_idx * frame_duration
118
+ end_time = len(speech_frames) * frame_duration
119
+ segments.append((start_time, end_time))
120
+
121
+ # Merge segments with small gaps
122
+ return cls._merge_speech_segments(segments)
123
+
124
+ @classmethod
125
+ def _merge_speech_segments(
126
+ cls, segments: list[tuple[float, float]]
127
+ ) -> list[tuple[float, float]]:
128
+ """Merge speech segments with small gaps."""
129
+ if not segments:
130
+ return segments
131
+
132
+ merged: list[tuple[float, float]] = [segments[0]]
133
+ for start, end in segments[1:]:
134
+ prev_start, prev_end = merged[-1]
135
+ if start - prev_end <= cls.VAD_MAX_GAP:
136
+ merged[-1] = (prev_start, end)
137
+ else:
138
+ merged.append((start, end))
139
+ return merged
140
+
141
+ @classmethod
142
+ def _is_in_speech(cls, time: float, speech_regions: list[tuple[float, float]]) -> bool:
143
+ """Check if a timestamp falls within any speech region."""
144
+ return any(start <= time <= end for start, end in speech_regions)
145
+
146
+ @classmethod
147
+ def _find_nearest_speech_boundary(
148
+ cls, time: float, speech_regions: list[tuple[float, float]], direction: str = "any"
149
+ ) -> float:
150
+ """Find the nearest speech region boundary to a timestamp.
151
+
152
+ Args:
153
+ time: Timestamp to find boundary for
154
+ speech_regions: List of (start, end) speech regions
155
+ direction: "start" for word starts, "end" for word ends, "any" for closest
156
+
157
+ Returns:
158
+ Adjusted timestamp snapped to nearest speech boundary
159
+ """
160
+ if not speech_regions:
161
+ return time
162
+
163
+ best_time = time
164
+ min_dist = float("inf")
165
+
166
+ for start, end in speech_regions:
167
+ # If time is inside this region, return as-is
168
+ if start <= time <= end:
169
+ return time
170
+
171
+ # Check distance to boundaries
172
+ if direction in ("start", "any"):
173
+ dist = abs(time - start)
174
+ if dist < min_dist:
175
+ min_dist = dist
176
+ best_time = start
177
+
178
+ if direction in ("end", "any"):
179
+ dist = abs(time - end)
180
+ if dist < min_dist:
181
+ min_dist = dist
182
+ best_time = end
183
+
184
+ return best_time
185
+
186
+ # Confidence threshold for alignment scores (log probability)
187
+ MIN_CONFIDENCE = -5.0 # Tokens with scores below this are considered low-confidence
188
+
189
  @classmethod
190
  def align(
191
  cls,
 
194
  sample_rate: int = 16000,
195
  _language: str = "eng",
196
  _batch_size: int = 16,
197
+ use_vad: bool = True,
198
  ) -> list[dict]:
199
  """Align transcript to audio and return word-level timestamps.
200
 
 
204
  sample_rate: Audio sample rate (default 16000)
205
  _language: ISO-639-3 language code (default "eng" for English, unused)
206
  _batch_size: Batch size for alignment model (unused)
207
+ use_vad: If True, use VAD to refine word boundaries (default True)
208
 
209
  Returns:
210
+ List of dicts with 'word', 'start', 'end', 'confidence' keys
211
  """
212
  import torchaudio
213
  from torchaudio.functional import forced_align, merge_tokens
 
215
  device = _get_device()
216
  model, labels, dictionary = cls.get_instance(device)
217
 
218
+ # Step 1: Get speech regions using VAD (before any processing)
219
+ speech_regions = []
220
+ if use_vad:
221
+ speech_regions = cls._get_speech_regions(audio, sample_rate)
222
+
223
  # Convert audio to tensor (copy to ensure array is writable)
224
  if isinstance(audio, np.ndarray):
225
  waveform = torch.from_numpy(audio.copy()).float()
 
272
  frame_duration = 320 / cls._bundle.sample_rate
273
 
274
  # Group token spans into words based on pipe separator
275
+ # Track confidence scores per word
276
  words = text.split()
277
  word_timestamps = []
278
  current_word_start = None
279
  current_word_end = None
280
+ current_word_scores: list[float] = []
281
  word_idx = 0
282
 
283
  for span in token_spans:
284
  token_char = labels[span.token]
285
  if token_char == "|": # Word separator
286
  if current_word_start is not None and word_idx < len(words):
287
+ # Calculate word confidence as mean of token scores
288
+ confidence = (
289
+ sum(current_word_scores) / len(current_word_scores)
290
+ if current_word_scores
291
+ else 0.0
292
+ )
293
  word_timestamps.append(
294
  {
295
  "word": words[word_idx],
296
  "start": current_word_start * frame_duration,
297
  "end": current_word_end * frame_duration,
298
+ "confidence": confidence,
299
  }
300
  )
301
  word_idx += 1
302
  current_word_start = None
303
  current_word_end = None
304
+ current_word_scores = []
305
  else:
306
  if current_word_start is None:
307
  current_word_start = span.start
308
  current_word_end = span.end
309
+ current_word_scores.append(span.score)
310
 
311
  # Don't forget the last word
312
  if current_word_start is not None and word_idx < len(words):
313
+ confidence = (
314
+ sum(current_word_scores) / len(current_word_scores) if current_word_scores else 0.0
315
+ )
316
  word_timestamps.append(
317
  {
318
  "word": words[word_idx],
319
  "start": current_word_start * frame_duration,
320
  "end": current_word_end * frame_duration,
321
+ "confidence": confidence,
322
  }
323
  )
324
 
325
+ # Step 2: Refine timestamps using VAD
326
+ if use_vad and speech_regions:
327
+ word_timestamps = cls._refine_with_vad(word_timestamps, speech_regions)
328
+
329
  return word_timestamps
330
 
331
+ @classmethod
332
+ def _refine_with_vad(
333
+ cls, word_timestamps: list[dict], speech_regions: list[tuple[float, float]]
334
+ ) -> list[dict]:
335
+ """Refine word timestamps using VAD speech regions.
336
+
337
+ - Words with low confidence that fall outside speech regions are flagged
338
+ - Word boundaries are snapped to speech region boundaries when close
339
+
340
+ Args:
341
+ word_timestamps: List of word dicts with 'start', 'end', 'confidence'
342
+ speech_regions: List of (start, end) speech regions
343
+
344
+ Returns:
345
+ Refined word timestamps
346
+ """
347
+ if not word_timestamps or not speech_regions:
348
+ return word_timestamps
349
+
350
+ refined = []
351
+ for word in word_timestamps:
352
+ start = word["start"]
353
+ end = word["end"]
354
+ confidence = word.get("confidence", 0.0)
355
+
356
+ # Check if word midpoint is in a speech region
357
+ midpoint = (start + end) / 2
358
+ in_speech = cls._is_in_speech(midpoint, speech_regions)
359
+
360
+ # For low-confidence words outside speech, snap to nearest speech boundary
361
+ if not in_speech and confidence < cls.MIN_CONFIDENCE:
362
+ # Find the nearest speech region and snap boundaries
363
+ start = cls._find_nearest_speech_boundary(start, speech_regions, "start")
364
+ end = cls._find_nearest_speech_boundary(end, speech_regions, "end")
365
+ # Ensure start < end
366
+ if start >= end:
367
+ end = start + 0.01
368
+
369
+ # For words near speech boundaries, snap to the boundary
370
+ # This helps align word edges with actual speech onset/offset
371
+ snap_threshold = 0.05 # 50ms
372
+ for region_start, region_end in speech_regions:
373
+ # Snap start to speech region start if close
374
+ if abs(start - region_start) < snap_threshold:
375
+ start = region_start
376
+ # Snap end to speech region end if close
377
+ if abs(end - region_end) < snap_threshold:
378
+ end = region_end
379
+
380
+ refined.append(
381
+ {
382
+ "word": word["word"],
383
+ "start": start,
384
+ "end": end,
385
+ "confidence": confidence,
386
+ }
387
+ )
388
+
389
+ return refined
390
+
391
 
392
  try:
393
  from .diarization import SpeakerDiarizer
diarization.py CHANGED
@@ -1,20 +1,18 @@
1
- """Speaker diarization using TEN-VAD + WavLM + spectral clustering.
2
-
3
- Pipeline:
4
- 1. TEN-VAD detects speech segments
5
- 2. WavLM (microsoft/wavlm-base-plus-sv) extracts speaker embeddings
6
- 3. Spectral clustering groups embeddings by speaker
7
 
8
  Spectral clustering implementation adapted from FunASR/3D-Speaker:
9
  https://github.com/alibaba-damo-academy/FunASR
10
  MIT License (https://opensource.org/licenses/MIT)
11
  """
12
 
 
 
13
  import numpy as np
14
  import scipy
15
  import sklearn.metrics.pairwise
16
  import torch
17
  from sklearn.cluster._kmeans import k_means
 
18
 
19
 
20
  def _get_device() -> torch.device:
@@ -71,23 +69,24 @@ class SpectralCluster:
71
  return sklearn.metrics.pairwise.cosine_similarity(embeddings, embeddings)
72
 
73
  def p_pruning(self, affinity: np.ndarray) -> np.ndarray:
74
- """Prune low similarity values in affinity matrix."""
75
- pval = 6.0 / affinity.shape[0] if affinity.shape[0] * self.pval < 6 else self.pval
76
- n_elems = int((1 - pval) * affinity.shape[0])
77
-
78
- # For each row in affinity matrix, zero out low similarities
79
- for i in range(affinity.shape[0]):
80
- low_indexes = np.argsort(affinity[i, :])
81
- low_indexes = low_indexes[0:n_elems]
82
- affinity[i, low_indexes] = 0
 
83
  return affinity
84
 
85
  def get_laplacian(self, sim_mat: np.ndarray) -> np.ndarray:
86
  """Compute unnormalized Laplacian matrix."""
87
- sim_mat[np.diag_indices(sim_mat.shape[0])] = 0
88
- degree = np.sum(np.abs(sim_mat), axis=1)
89
- degree_mat = np.diag(degree)
90
- return degree_mat - sim_mat
91
 
92
  def get_spec_embs(
93
  self, laplacian: np.ndarray, k_oracle: int | None = None
@@ -111,13 +110,9 @@ class SpectralCluster:
111
  _, labels, _ = k_means(emb, k, n_init=10)
112
  return labels
113
 
114
- def get_eigen_gaps(self, eig_vals: np.ndarray) -> list[float]:
115
  """Compute gaps between consecutive eigenvalues."""
116
- eig_vals_gap_list = []
117
- for i in range(len(eig_vals) - 1):
118
- gap = float(eig_vals[i + 1]) - float(eig_vals[i])
119
- eig_vals_gap_list.append(gap)
120
- return eig_vals_gap_list
121
 
122
 
123
  class SpeakerClusterer:
@@ -172,13 +167,9 @@ class SpeakerClusterer:
172
  if embeddings.shape[0] < 6:
173
  return np.zeros(embeddings.shape[0], dtype=int)
174
 
175
- # Normalize embeddings
176
- norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
177
- norms = np.maximum(norms, 1e-10)
178
- embeddings = embeddings / norms
179
-
180
- # Replace NaN/inf with zeros
181
  embeddings = np.nan_to_num(embeddings, nan=0.0, posinf=0.0, neginf=0.0)
 
182
 
183
  # Run spectral clustering (suppress numerical warnings)
184
  spectral = self._get_spectral_cluster()
@@ -208,49 +199,34 @@ class SpeakerClusterer:
208
 
209
  def _merge_by_cos(self, labels: np.ndarray, embs: np.ndarray, cos_thr: float) -> np.ndarray:
210
  """Merge similar speakers by cosine similarity of centroids."""
211
- labels = labels.copy()
212
-
213
- while True:
214
- spk_num = labels.max() + 1
215
- if spk_num == 1:
216
- break
217
-
218
- # Compute speaker centroids
219
- spk_center = []
220
- for i in range(spk_num):
221
- spk_emb = embs[labels == i].mean(0)
222
- spk_center.append(spk_emb)
223
-
224
- if len(spk_center) == 0:
225
- break
226
-
227
- spk_center = np.stack(spk_center, axis=0)
228
- norm_spk_center = spk_center / np.linalg.norm(spk_center, axis=1, keepdims=True)
229
- affinity = np.matmul(norm_spk_center, norm_spk_center.T)
230
- affinity = np.triu(affinity, 1)
231
-
232
- # Find most similar pair
233
- spks = np.unravel_index(np.argmax(affinity), affinity.shape)
234
- if affinity[spks] < cos_thr:
235
- break
236
-
237
- # Merge speakers
238
- for i in range(len(labels)):
239
- if labels[i] == spks[1]:
240
- labels[i] = spks[0]
241
- elif labels[i] > spks[1]:
242
- labels[i] -= 1
243
 
244
- return labels
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
 
247
  class LocalSpeakerDiarizer:
248
- """Local speaker diarization using TEN-VAD + WavLM + spectral clustering.
249
 
250
  Pipeline:
251
  1. TEN-VAD detects speech segments
252
  2. Sliding window (1.0s, 75% overlap) for uniform embedding extraction
253
- 3. WavLM extracts speaker embeddings per window
254
  4. Spectral clustering with eigenvalue gap for auto speaker detection
255
  5. Frame-level consensus voting for segment reconstruction
256
  6. Post-processing merges short segments to reduce flicker
@@ -269,15 +245,14 @@ class LocalSpeakerDiarizer:
269
  """
270
 
271
  _ten_vad_model = None
272
- _speaker_model = None
273
  _device = None
274
 
275
  # ==================== TUNABLE PARAMETERS ====================
276
 
277
  # Sliding window for embedding extraction
278
- # Longer windows (1.5-2.0s) capture more prosody, reducing speaker confusion
279
- WINDOW_SIZE = 1.5 # seconds
280
- STEP_SIZE = 0.5 # seconds (67% overlap)
281
  TAIL_COVERAGE_RATIO = 0.1 # Add extra window if tail > this ratio of window
282
 
283
  # VAD hysteresis parameters
@@ -291,8 +266,8 @@ class LocalSpeakerDiarizer:
291
  VOTING_RATE = 0.01 # 10ms resolution for consensus voting
292
 
293
  # Post-processing
294
- MIN_SEGMENT_DURATION = 0.3 # Minimum final segment duration (seconds)
295
- SHORT_SEGMENT_GAP = 0.3 # Gap threshold for merging short segments
296
  SAME_SPEAKER_GAP = 0.5 # Gap threshold for merging same-speaker segments
297
 
298
  # ===========================================================
@@ -314,21 +289,21 @@ class LocalSpeakerDiarizer:
314
  return cls._device
315
 
316
  @classmethod
317
- def _get_speaker_model(cls):
318
- """Lazy-load WavLM speaker embedding model (singleton)."""
319
- if cls._speaker_model is None:
320
- from transformers import WavLMForXVector
321
-
322
- cls._speaker_model = WavLMForXVector.from_pretrained(
323
- "microsoft/wavlm-base-plus-sv",
324
- )
325
-
326
- # Move model to best available device (MPS/CUDA/CPU)
327
- device = cls._get_device()
328
- cls._speaker_model = cls._speaker_model.to(device)
329
- cls._speaker_model.eval()
330
 
331
- return cls._speaker_model
332
 
333
  @classmethod
334
  def diarize(
@@ -382,10 +357,7 @@ class LocalSpeakerDiarizer:
382
  clusterer = SpeakerClusterer(min_num_spks=min_speakers, max_num_spks=max_speakers)
383
  labels = clusterer(embeddings, num_speakers)
384
 
385
- # Step 4: Centroid refinement - reduces flickering/confusion
386
- labels = cls._refine_with_centroids(embeddings, labels)
387
-
388
- # Step 5: Post-process with consensus voting (VAD-aware)
389
  return cls._postprocess_segments(window_segments, labels, total_duration, vad_frames)
390
 
391
  @classmethod
@@ -483,64 +455,12 @@ class LocalSpeakerDiarizer:
483
 
484
  return filtered
485
 
486
- @classmethod
487
- def _refine_with_centroids(cls, embeddings: np.ndarray, labels: np.ndarray) -> np.ndarray:
488
- """Refine cluster assignments using nearest centroid.
489
-
490
- This reduces "flickering" where embeddings rapidly switch between speakers.
491
- For each embedding, we re-assign it to the speaker whose centroid is closest
492
- (by cosine similarity).
493
-
494
- Args:
495
- embeddings: Speaker embeddings of shape [N, D]
496
- labels: Initial cluster labels of shape [N]
497
-
498
- Returns:
499
- Refined labels of shape [N]
500
- """
501
- if len(embeddings) == 0 or len(np.unique(labels)) <= 1:
502
- return labels
503
-
504
- # Normalize embeddings for cosine similarity
505
- norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
506
- norms = np.maximum(norms, 1e-10)
507
- norm_embeddings = embeddings / norms
508
-
509
- # Calculate centroid for each speaker
510
- unique_labels = np.unique(labels)
511
- centroids = {}
512
- for label in unique_labels:
513
- mask = labels == label
514
- speaker_embs = norm_embeddings[mask]
515
- centroid = speaker_embs.mean(axis=0)
516
- # Normalize centroid
517
- centroid_norm = np.linalg.norm(centroid)
518
- if centroid_norm > 1e-10:
519
- centroids[label] = centroid / centroid_norm
520
- else:
521
- centroids[label] = centroid
522
-
523
- # Re-assign each embedding to nearest centroid
524
- refined_labels = np.zeros_like(labels)
525
- for i, emb in enumerate(norm_embeddings):
526
- best_label = labels[i]
527
- best_sim = -1.0
528
- for label, centroid in centroids.items():
529
- sim = np.dot(emb, centroid)
530
- if sim > best_sim:
531
- best_sim = sim
532
- best_label = label
533
- refined_labels[i] = best_label
534
-
535
- return refined_labels
536
-
537
  @classmethod
538
  def _extract_embeddings(
539
  cls, audio_array: np.ndarray, segments: list[dict], sample_rate: int
540
  ) -> tuple[np.ndarray, list[dict]]:
541
  """Extract speaker embeddings using sliding windows."""
542
- speaker_model = cls._get_speaker_model()
543
- device = cls._get_device()
544
 
545
  window_samples = int(cls.WINDOW_SIZE * sample_rate)
546
  step_samples = int(cls.STEP_SIZE * sample_rate)
@@ -577,17 +497,15 @@ class LocalSpeakerDiarizer:
577
  pad_width = window_samples - len(chunk)
578
  chunk = np.pad(chunk, (0, pad_width), mode="reflect")
579
 
580
- # Extract embedding (WavLMForXVector returns XVectorOutput with .embeddings)
581
- chunk_tensor = torch.from_numpy(chunk).float().unsqueeze(0).to(device)
582
- output = speaker_model(chunk_tensor)
583
- embedding = output.embeddings.squeeze(0).cpu().numpy()
584
-
585
- # Validate and normalize
586
- if not np.isfinite(embedding).all():
587
- continue
588
- norm = np.linalg.norm(embedding)
589
- if norm > 1e-8:
590
- embeddings.append(embedding / norm)
591
  window_segments.append(
592
  {
593
  "start": c_start / sample_rate,
@@ -595,8 +513,9 @@ class LocalSpeakerDiarizer:
595
  }
596
  )
597
 
 
598
  if embeddings:
599
- return np.array(embeddings), window_segments
600
  return np.array([]), []
601
 
602
  @classmethod
@@ -611,15 +530,12 @@ class LocalSpeakerDiarizer:
611
  return np.zeros(num_frames, dtype=bool)
612
 
613
  vad_rate = 256 / 16000 # 16ms per VAD frame
614
- result = np.zeros(num_frames, dtype=bool)
615
-
616
- for i in range(num_frames):
617
- voting_time = i * cls.VOTING_RATE
618
- vad_frame = int(voting_time / vad_rate)
619
- if vad_frame < len(vad_frames):
620
- result[i] = vad_frames[vad_frame]
621
 
622
- return result
 
 
 
623
 
624
  @classmethod
625
  def _postprocess_segments(
@@ -768,7 +684,7 @@ class LocalSpeakerDiarizer:
768
 
769
 
770
  class SpeakerDiarizer:
771
- """Speaker diarization using TEN-VAD + WavLM + spectral clustering.
772
 
773
  Example:
774
  >>> segments = SpeakerDiarizer.diarize(audio_array)
 
1
+ """Speaker diarization using TEN-VAD + ECAPA-TDNN + spectral clustering.
 
 
 
 
 
2
 
3
  Spectral clustering implementation adapted from FunASR/3D-Speaker:
4
  https://github.com/alibaba-damo-academy/FunASR
5
  MIT License (https://opensource.org/licenses/MIT)
6
  """
7
 
8
+ import warnings
9
+
10
  import numpy as np
11
  import scipy
12
  import sklearn.metrics.pairwise
13
  import torch
14
  from sklearn.cluster._kmeans import k_means
15
+ from sklearn.preprocessing import normalize
16
 
17
 
18
  def _get_device() -> torch.device:
 
69
  return sklearn.metrics.pairwise.cosine_similarity(embeddings, embeddings)
70
 
71
  def p_pruning(self, affinity: np.ndarray) -> np.ndarray:
72
+ """Prune low similarity values in affinity matrix (keep top pval fraction)."""
73
+ n = affinity.shape[0]
74
+ pval = max(self.pval, 6.0 / n)
75
+ k_keep = max(1, int(pval * n))
76
+
77
+ # Vectorized: find top-k indices per row and zero out the rest
78
+ top_k_idx = np.argpartition(affinity, -k_keep, axis=1)[:, -k_keep:]
79
+ mask = np.zeros_like(affinity, dtype=bool)
80
+ np.put_along_axis(mask, top_k_idx, True, axis=1)
81
+ affinity[~mask] = 0
82
  return affinity
83
 
84
  def get_laplacian(self, sim_mat: np.ndarray) -> np.ndarray:
85
  """Compute unnormalized Laplacian matrix."""
86
+ from scipy.sparse.csgraph import laplacian
87
+
88
+ np.fill_diagonal(sim_mat, 0)
89
+ return laplacian(sim_mat, normed=False)
90
 
91
  def get_spec_embs(
92
  self, laplacian: np.ndarray, k_oracle: int | None = None
 
110
  _, labels, _ = k_means(emb, k, n_init=10)
111
  return labels
112
 
113
+ def get_eigen_gaps(self, eig_vals: np.ndarray) -> np.ndarray:
114
  """Compute gaps between consecutive eigenvalues."""
115
+ return np.diff(eig_vals)
 
 
 
 
116
 
117
 
118
  class SpeakerClusterer:
 
167
  if embeddings.shape[0] < 6:
168
  return np.zeros(embeddings.shape[0], dtype=int)
169
 
170
+ # Normalize embeddings and replace NaN/inf
 
 
 
 
 
171
  embeddings = np.nan_to_num(embeddings, nan=0.0, posinf=0.0, neginf=0.0)
172
+ embeddings = normalize(embeddings)
173
 
174
  # Run spectral clustering (suppress numerical warnings)
175
  spectral = self._get_spectral_cluster()
 
199
 
200
  def _merge_by_cos(self, labels: np.ndarray, embs: np.ndarray, cos_thr: float) -> np.ndarray:
201
  """Merge similar speakers by cosine similarity of centroids."""
202
+ from scipy.cluster.hierarchy import fcluster, linkage
203
+ from scipy.spatial.distance import pdist
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
+ unique_labels = np.unique(labels)
206
+ if len(unique_labels) <= 1:
207
+ return labels
208
+
209
+ # Compute normalized speaker centroids
210
+ centroids = np.array([embs[labels == lbl].mean(0) for lbl in unique_labels])
211
+ centroids = normalize(centroids)
212
+
213
+ # Hierarchical clustering with cosine distance
214
+ distances = pdist(centroids, metric="cosine")
215
+ linkage_matrix = linkage(distances, method="average")
216
+ merged_labels = fcluster(linkage_matrix, t=1.0 - cos_thr, criterion="distance") - 1
217
+
218
+ # Map original labels to merged labels
219
+ label_map = dict(zip(unique_labels, merged_labels))
220
+ return np.array([label_map[lbl] for lbl in labels])
221
 
222
 
223
  class LocalSpeakerDiarizer:
224
+ """Local speaker diarization using TEN-VAD + ECAPA-TDNN + spectral clustering.
225
 
226
  Pipeline:
227
  1. TEN-VAD detects speech segments
228
  2. Sliding window (0.75s, 80% overlap) for uniform embedding extraction
229
+ 3. ECAPA-TDNN extracts speaker embeddings per window
230
  4. Spectral clustering with eigenvalue gap for auto speaker detection
231
  5. Frame-level consensus voting for segment reconstruction
232
  6. Post-processing merges short segments to reduce flicker
 
245
  """
246
 
247
  _ten_vad_model = None
248
+ _ecapa_model = None
249
  _device = None
250
 
251
  # ==================== TUNABLE PARAMETERS ====================
252
 
253
  # Sliding window for embedding extraction
254
+ WINDOW_SIZE = 0.75 # seconds - shorter window for finer resolution
255
+ STEP_SIZE = 0.15 # seconds (80% overlap for more votes)
 
256
  TAIL_COVERAGE_RATIO = 0.1 # Add extra window if tail > this ratio of window
257
 
258
  # VAD hysteresis parameters
 
266
  VOTING_RATE = 0.01 # 10ms resolution for consensus voting
267
 
268
  # Post-processing
269
+ MIN_SEGMENT_DURATION = 0.15 # Minimum final segment duration (seconds)
270
+ SHORT_SEGMENT_GAP = 0.1 # Gap threshold for merging short segments
271
  SAME_SPEAKER_GAP = 0.5 # Gap threshold for merging same-speaker segments
272
 
273
  # ===========================================================
 
289
  return cls._device
290
 
291
  @classmethod
292
+ def _get_ecapa_model(cls):
293
+ """Lazy-load ECAPA-TDNN speaker embedding model (singleton)."""
294
+ if cls._ecapa_model is None:
295
+ # Suppress torchaudio deprecation warning from SpeechBrain
296
+ with warnings.catch_warnings():
297
+ warnings.filterwarnings("ignore", message="torchaudio._backend")
298
+ from speechbrain.inference.speaker import EncoderClassifier
299
+
300
+ device = cls._get_device()
301
+ cls._ecapa_model = EncoderClassifier.from_hparams(
302
+ source="speechbrain/spkrec-ecapa-voxceleb",
303
+ run_opts={"device": str(device)},
304
+ )
305
 
306
+ return cls._ecapa_model
307
 
308
  @classmethod
309
  def diarize(
 
357
  clusterer = SpeakerClusterer(min_num_spks=min_speakers, max_num_spks=max_speakers)
358
  labels = clusterer(embeddings, num_speakers)
359
 
360
+ # Step 4: Post-process with consensus voting (VAD-aware)
 
 
 
361
  return cls._postprocess_segments(window_segments, labels, total_duration, vad_frames)
362
 
363
  @classmethod
 
455
 
456
  return filtered
457
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
458
  @classmethod
459
  def _extract_embeddings(
460
  cls, audio_array: np.ndarray, segments: list[dict], sample_rate: int
461
  ) -> tuple[np.ndarray, list[dict]]:
462
  """Extract speaker embeddings using sliding windows."""
463
+ speaker_model = cls._get_ecapa_model()
 
464
 
465
  window_samples = int(cls.WINDOW_SIZE * sample_rate)
466
  step_samples = int(cls.STEP_SIZE * sample_rate)
 
497
  pad_width = window_samples - len(chunk)
498
  chunk = np.pad(chunk, (0, pad_width), mode="reflect")
499
 
500
+ # Extract embedding using SpeechBrain's encode_batch
501
+ chunk_tensor = torch.from_numpy(chunk).float().unsqueeze(0)
502
+ embedding = (
503
+ speaker_model.encode_batch(chunk_tensor).squeeze(0).squeeze(0).cpu().numpy()
504
+ )
505
+
506
+ # Validate embedding
507
+ if np.isfinite(embedding).all() and np.linalg.norm(embedding) > 1e-8:
508
+ embeddings.append(embedding)
 
 
509
  window_segments.append(
510
  {
511
  "start": c_start / sample_rate,
 
513
  }
514
  )
515
 
516
+ # Normalize all embeddings at once
517
  if embeddings:
518
+ return normalize(np.array(embeddings)), window_segments
519
  return np.array([]), []
520
 
521
  @classmethod
 
530
  return np.zeros(num_frames, dtype=bool)
531
 
532
  vad_rate = 256 / 16000 # 16ms per VAD frame
533
+ vad_arr = np.array(vad_frames)
 
 
 
 
 
 
534
 
535
+ # Vectorized: compute VAD frame indices for each voting frame
536
+ voting_times = np.arange(num_frames) * cls.VOTING_RATE
537
+ vad_indices = np.clip((voting_times / vad_rate).astype(int), 0, len(vad_arr) - 1)
538
+ return vad_arr[vad_indices]
539
 
540
  @classmethod
541
  def _postprocess_segments(
 
684
 
685
 
686
  class SpeakerDiarizer:
687
+ """Speaker diarization using TEN-VAD + ECAPA-TDNN + spectral clustering.
688
 
689
  Example:
690
  >>> segments = SpeakerDiarizer.diarize(audio_array)