from typing import Any, Dict

import torch
from pyannote.audio import Pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
from transformers.utils import logging

logger = logging.get_logger("MAXWELL")

# ffmpeg_read resamples the incoming audio to this rate; the pyannote
# diarization pipeline expects 16 kHz audio.
SAMPLE_RATE = 16000


class EndpointHandler:
    def __init__(self, path: str = ""):
        # Load the pretrained diarization pipeline from the Hugging Face Hub.
        self.pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1")

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Args:
            data (:obj:`dict`):
                includes the deserialized audio file as bytes under "inputs"
                and an optional "parameters" dict forwarded to the pipeline
        Return:
            A :obj:`dict` with a "diarization" list of speaker segments
            (label, start, stop)
        """
        # Log that a request arrived without dumping the raw audio bytes.
        logger.info("Received diarization request")
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", None)

        # Decode the incoming bytes with ffmpeg, resample to 16 kHz, and wrap
        # the waveform in the input format pyannote expects.
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
        audio_tensor = torch.from_numpy(audio_nparray).unsqueeze(0)
        pyannote_input = {"waveform": audio_tensor, "sample_rate": SAMPLE_RATE}

        # Forward optional pipeline parameters (e.g. num_speakers) when given.
        if parameters is not None:
            diarization = self.pipeline(pyannote_input, **parameters)
        else:
            diarization = self.pipeline(pyannote_input)

        # Flatten the pyannote Annotation into JSON-serializable segments.
        processed_diarization = [
            {"label": str(label), "start": str(segment.start), "stop": str(segment.end)}
            for segment, _, label in diarization.itertracks(yield_label=True)
        ]

        return {"diarization": processed_diarization}
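

# A minimal local smoke test, as a sketch: "sample.wav" is a hypothetical
# audio file, and loading the pyannote model assumes this machine can
# access it on the Hugging Face Hub.
if __name__ == "__main__":
    handler = EndpointHandler()
    with open("sample.wav", "rb") as f:  # hypothetical test file
        payload = {"inputs": f.read(), "parameters": {"num_speakers": 2}}
    for segment in handler(payload)["diarization"]:
        print(segment["label"], segment["start"], segment["stop"])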