Spaces:
Running
Running
Reduce speaker over-segmentation in auto clustering
Browse files
- app/main.py +2 -1
- app/pipeline.py +2 -1
- models/clusterer.py +9 -4
app/main.py
CHANGED
|
@@ -74,7 +74,7 @@ def get_pipeline():
|
|
| 74 |
use_pyannote_diarization=os.getenv("USE_PYANNOTE_DIARIZATION", "true").lower() in {"1", "true", "yes"},
|
| 75 |
pyannote_diarization_model=os.getenv("PYANNOTE_DIARIZATION_MODEL", "pyannote/speaker-diarization-3.1"),
|
| 76 |
hf_token=os.getenv("HF_TOKEN"),
|
| 77 |
-
max_speakers=
|
| 78 |
cache_dir=cache_dir,
|
| 79 |
)
|
| 80 |
return _pipeline
|
|
@@ -287,3 +287,4 @@ if static_dir.exists():
|
|
| 287 |
app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")
|
| 288 |
|
| 289 |
|
|
|
|
|
|
| 74 |
use_pyannote_diarization=os.getenv("USE_PYANNOTE_DIARIZATION", "true").lower() in {"1", "true", "yes"},
|
| 75 |
pyannote_diarization_model=os.getenv("PYANNOTE_DIARIZATION_MODEL", "pyannote/speaker-diarization-3.1"),
|
| 76 |
hf_token=os.getenv("HF_TOKEN"),
|
| 77 |
+
max_speakers=int(os.getenv("MAX_SPEAKERS", "6")),
|
| 78 |
cache_dir=cache_dir,
|
| 79 |
)
|
| 80 |
return _pipeline
|
|
|
|
| 287 |
app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")
|
| 288 |
|
| 289 |
|
| 290 |
+
|
app/pipeline.py
CHANGED
|
@@ -73,7 +73,7 @@ class DiarizationPipeline:
|
|
| 73 |
pyannote_diarization_model: str = "pyannote/speaker-diarization-3.1",
|
| 74 |
hf_token: Optional[str] = None,
|
| 75 |
num_speakers: Optional[int] = None,
|
| 76 |
-
max_speakers: int =
|
| 77 |
cache_dir: str = "./model_cache",
|
| 78 |
):
|
| 79 |
self.device = self._resolve_device(device)
|
|
@@ -411,3 +411,4 @@ class DiarizationPipeline:
|
|
| 411 |
processing_time=processing_time,
|
| 412 |
sample_rate=sample_rate,
|
| 413 |
)
|
|
|
|
|
|
| 73 |
pyannote_diarization_model: str = "pyannote/speaker-diarization-3.1",
|
| 74 |
hf_token: Optional[str] = None,
|
| 75 |
num_speakers: Optional[int] = None,
|
| 76 |
+
max_speakers: int = 6,
|
| 77 |
cache_dir: str = "./model_cache",
|
| 78 |
):
|
| 79 |
self.device = self._resolve_device(device)
|
|
|
|
| 411 |
processing_time=processing_time,
|
| 412 |
sample_rate=sample_rate,
|
| 413 |
)
|
| 414 |
+
|
models/clusterer.py
CHANGED
|
@@ -39,11 +39,13 @@ class SpeakerClusterer:
|
|
| 39 |
if n <= 2:
|
| 40 |
return n
|
| 41 |
|
| 42 |
-
|
| 43 |
-
best_score = -1.0
|
| 44 |
upper_k = min(self.max_speakers, n - 1)
|
| 45 |
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
| 47 |
labels = fcluster(linkage_matrix, k, criterion="maxclust")
|
| 48 |
if len(np.unique(labels)) < 2:
|
| 49 |
continue
|
|
@@ -63,10 +65,13 @@ class SpeakerClusterer:
|
|
| 63 |
k_threshold = len(np.unique(threshold_labels))
|
| 64 |
k_threshold = int(np.clip(k_threshold, self.min_speakers, min(self.max_speakers, n)))
|
| 65 |
|
|
|
|
| 66 |
if best_score < 0.08:
|
| 67 |
chosen_k = k_threshold
|
| 68 |
else:
|
| 69 |
-
chosen_k =
|
|
|
|
|
|
|
| 70 |
|
| 71 |
logger.info(
|
| 72 |
f"Optimal speaker count: {chosen_k} "
|
|
|
|
| 39 |
if n <= 2:
|
| 40 |
return n
|
| 41 |
|
| 42 |
+
min_k = max(2, self.min_speakers)
|
|
|
|
| 43 |
upper_k = min(self.max_speakers, n - 1)
|
| 44 |
|
| 45 |
+
best_k = min_k
|
| 46 |
+
best_score = -1.0
|
| 47 |
+
|
| 48 |
+
for k in range(min_k, upper_k + 1):
|
| 49 |
labels = fcluster(linkage_matrix, k, criterion="maxclust")
|
| 50 |
if len(np.unique(labels)) < 2:
|
| 51 |
continue
|
|
|
|
| 65 |
k_threshold = len(np.unique(threshold_labels))
|
| 66 |
k_threshold = int(np.clip(k_threshold, self.min_speakers, min(self.max_speakers, n)))
|
| 67 |
|
| 68 |
+
# Be conservative to avoid severe over-segmentation in open-domain audio.
|
| 69 |
if best_score < 0.08:
|
| 70 |
chosen_k = k_threshold
|
| 71 |
else:
|
| 72 |
+
chosen_k = min(best_k, k_threshold) if k_threshold >= 2 else best_k
|
| 73 |
+
|
| 74 |
+
chosen_k = int(np.clip(chosen_k, self.min_speakers, min(self.max_speakers, n)))
|
| 75 |
|
| 76 |
logger.info(
|
| 77 |
f"Optimal speaker count: {chosen_k} "
|