KIFF
/

pyannote-speaker-diarization-endpoint

@@ -1,49 +1,72 @@
-import os
-from pyannote.audio import Pipeline, Audio
 import torch
-# Previous version of the handler (removed side of this diff): loads the
-# "pyannote/speaker-diarization-3.1" pipeline and returns one dict per speaker turn.
-class EndpointHandler:
     def __init__(self, path=""):
-        # NOTE: `path` is not referenced anywhere in this method.
-        # Get the Hugging Face authentication token from the environment variable
-        auth_token = os.getenv("MY_KEY")
-        if not auth_token:
             raise ValueError("Hugging Face authentication token (MY_KEY) is missing.")
-        # Initialize pretrained pipeline with the token
-        self._pipeline = Pipeline.from_pretrained(
-            "pyannote/speaker-diarization-3.1", use_auth_token=auth_token
         )
-        # Send pipeline to GPU if available
-        if torch.cuda.is_available():
-            self._pipeline.to(torch.device("cuda"))
-        # Initialize audio reader
-        self._io = Audio()
-    def __call__(self, data):
-        # Extract inputs from request data
-        # pop() removes the key from the caller's dict; when "inputs" is absent
-        # the whole payload is handed to the audio reader instead.
-        inputs = data.pop("inputs", data)
-        waveform, sample_rate = self._io(inputs)
-        # Extract pipeline parameters if provided
-        parameters = data.pop("parameters", dict())
-        # Run speaker diarization
-        # Request parameters are forwarded to the pipeline call as keyword arguments.
-        diarization = self._pipeline(
-            {"waveform": waveform, "sample_rate": sample_rate}, **parameters
-        )
-        # Process diarization results
-        # Turn boundaries are rendered as strings with three decimal places.
         processed_diarization = [
             {
-                "speaker": speaker,
-                "start": f"{turn.start:.3f}",
-                "end": f"{turn.end:.3f}",
             }
-            for turn, _, speaker in diarization.itertracks(yield_label=True)
         ]
-        # Return results as JSON
         return {"diarization": processed_diarization}

+from typing import Dict
+from pyannote.audio import Pipeline
 import torch
+import base64
+import numpy as np
+import os
+# Sample rate reported to the pipeline for every decoded request payload; the
+# handler does not resample, so callers are presumably expected to send audio
+# already at this rate (TODO confirm with API consumers).
+SAMPLE_RATE = 16000
+class EndpointHandler():
+    """Callable handler that runs pyannote speaker diarization on raw PCM audio."""
+
     def __init__(self, path=""):
+        """Load the pretrained diarization pipeline and place it on GPU or CPU.
+
+        Args:
+            path: Not used by this handler.
+
+        Raises:
+            ValueError: If the MY_KEY environment variable is unset or empty.
+            RuntimeError: If the pretrained pipeline could not be loaded.
+        """
+        # Retrieve the Hugging Face authentication token from the environment variable
+        hf_token = os.getenv("MY_KEY")
+        if not hf_token:
             raise ValueError("Hugging Face authentication token (MY_KEY) is missing.")
+        # Initialize the pipeline with the authentication token
+        self.pipeline = Pipeline.from_pretrained(
+            "pyannote/speaker-diarization-3.1", use_auth_token=hf_token
         )
+        # NOTE(review): from_pretrained appears to return None (rather than raise)
+        # when the gated model cannot be downloaded -- confirm against the
+        # installed pyannote.audio version. Fail here with a clear message
+        # instead of an AttributeError on the next line.
+        if self.pipeline is None:
+            raise RuntimeError(
+                "Could not load 'pyannote/speaker-diarization-3.1'; "
+                "check that MY_KEY grants access to the model."
+            )
+        # Move the pipeline to the appropriate device (CPU or GPU)
+        self.pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+        # No explicit instantiate() call: the pretrained pipeline comes back
+        # already configured, and the previous call passed the bound method
+        # `self.pipeline.parameters` where a parameter dict is expected.
+
+    def __call__(self, data: Dict) -> Dict:
+        """Diarize one base64-encoded audio payload.
+
+        Args:
+            data (Dict):
+                'inputs': Base64-encoded raw audio samples. The bytes are read as
+                    native-endian signed 16-bit integers and treated as a single
+                    channel at SAMPLE_RATE; no container header is parsed.
+                'parameters': Accepted but ignored.
+        Return:
+            Dict: {"diarization": [...]} with one {"label", "start", "stop"} entry
+                per track (all values as strings), or {"error": <message>} when
+                the payload cannot be decoded or the pipeline fails.
+        """
+        inputs = data.get("inputs")
+        if not inputs:
+            return {"error": "Missing 'inputs': expected base64-encoded 16-bit PCM audio"}
+        # Decode the base64 audio data
+        try:
+            audio_data = base64.b64decode(inputs)
+            # Raises ValueError when the byte count is not a multiple of 2.
+            audio_nparray = np.frombuffer(audio_data, dtype=np.int16)
+        except (TypeError, ValueError) as e:
+            print(f"Could not decode audio payload: {e}")
+            return {"error": "Invalid 'inputs': expected base64-encoded 16-bit PCM audio"}
+        if audio_nparray.size == 0:
+            return {"error": "Invalid 'inputs': decoded audio is empty"}
+        # frombuffer always yields a 1-D, read-only view. astype() makes a
+        # writable float copy, and dividing by 32768 maps int16 amplitudes into
+        # [-1, 1) -- the range audio loaders normally hand to the pipeline.
+        samples = audio_nparray.astype(np.float32) / 32768.0
+        # Convert to a PyTorch tensor shaped (channel=1, num_samples)
+        audio_tensor = torch.from_numpy(samples).unsqueeze(0)
+        pyannote_input = {"waveform": audio_tensor, "sample_rate": SAMPLE_RATE}
+        # Run diarization pipeline
+        try:
+            diarization = self.pipeline(pyannote_input)  # No num_speakers parameter
+        except Exception as e:
+            # Top-level request boundary: report the failure instead of crashing.
+            print(f"An unexpected error occurred: {e}")
+            return {"error": "Diarization failed unexpectedly"}
+        # Build a friendly JSON response
         processed_diarization = [
             {
+                "label": str(label),
+                "start": str(segment.start),
+                "stop": str(segment.end),
             }
+            for segment, _, label in diarization.itertracks(yield_label=True)
         ]
         return {"diarization": processed_diarization}