|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import Dict, Mapping, Optional, Tuple, Union |
|
|
|
|
|
import numpy as np |
|
|
from pyannote_audio_utils.core import Annotation, SlidingWindow, SlidingWindowFeature |
|
|
from pyannote_audio_utils.core.utils.types import Label |
|
|
from pyannote_audio_utils.metrics.diarization import DiarizationErrorRate |
|
|
|
|
|
from pyannote_audio_utils.audio.core.inference import Inference |
|
|
from pyannote_audio_utils.audio.utils.signal import Binarize |
|
|
|
|
|
|
|
|
|
|
|
class SpeakerDiarizationMixin:
    """Defines a bunch of methods common to speaker diarization pipelines"""

    @staticmethod
    def set_num_speakers(
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
    ):
        """Validate number of speakers

        Parameters
        ----------
        num_speakers : int, optional
            Number of speakers. When provided, it overrides both
            `min_speakers` and `max_speakers`.
        min_speakers : int, optional
            Minimum number of speakers. Defaults to 1.
        max_speakers : int, optional
            Maximum number of speakers. Defaults to np.inf.

        Returns
        -------
        num_speakers : int or None
        min_speakers : int
        max_speakers : int or np.inf

        Raises
        ------
        ValueError
            If the resulting `min_speakers` is greater than `max_speakers`.
        """

        # `num_speakers`, when available, takes precedence over the bounds.
        # NOTE: `or` chaining means that 0 is handled like None (not provided).
        min_speakers = num_speakers or min_speakers or 1
        max_speakers = num_speakers or max_speakers or np.inf

        if min_speakers > max_speakers:
            raise ValueError(
                f"min_speakers must be smaller than (or equal to) max_speakers "
                f"(here: min_speakers={min_speakers:g} and max_speakers={max_speakers:g})."
            )

        # identical bounds leave a single possible value for the speaker count
        if min_speakers == max_speakers:
            num_speakers = min_speakers

        return num_speakers, min_speakers, max_speakers

    @staticmethod
    def optimal_mapping(
        reference: Union[Mapping, Annotation],
        hypothesis: Annotation,
        return_mapping: bool = False,
    ) -> Union[Annotation, Tuple[Annotation, Dict[Label, Label]]]:
        """Find the optimal bijective mapping between reference and hypothesis labels

        Parameters
        ----------
        reference : Annotation or Mapping
            Reference annotation. Can be an Annotation instance or
            a mapping with an "annotation" key. When the mapping also has
            an "annotated" key, its value is passed as `uem` to
            `DiarizationErrorRate.optimal_mapping`.
        hypothesis : Annotation
            Hypothesized annotation.
        return_mapping : bool, optional
            Return the label mapping itself along with the mapped annotation. Defaults to False.

        Returns
        -------
        mapped : Annotation
            Hypothesis mapped to reference speakers.
        mapping : dict, optional
            Mapping between hypothesis (key) and reference (value) labels
            Only returned if `return_mapping` is True.
        """

        if isinstance(reference, Mapping):
            # Read "annotated" from the mapping *before* rebinding `reference`
            # to the annotation: once rebound, the mapping is no longer
            # reachable and its annotated regions would never be used.
            annotated = reference["annotated"] if "annotated" in reference else None
            reference = reference["annotation"]
        else:
            annotated = None

        mapping = DiarizationErrorRate().optimal_mapping(
            reference, hypothesis, uem=annotated
        )
        mapped_hypothesis = hypothesis.rename_labels(mapping=mapping)

        if return_mapping:
            return mapped_hypothesis, mapping

        return mapped_hypothesis

    @staticmethod
    def speaker_count(
        binarized_segmentations: SlidingWindowFeature,
        frames: SlidingWindow,
        warm_up: Tuple[float, float] = (0.1, 0.1),
    ) -> SlidingWindowFeature:
        """Estimate frame-level number of instantaneous speakers

        Parameters
        ----------
        binarized_segmentations : SlidingWindowFeature
            (num_chunks, num_frames, num_classes)-shaped binarized scores.
        frames : SlidingWindow
            Frames resolution, forwarded to `Inference.aggregate`.
        warm_up : (float, float) tuple, optional
            Left/right warm up ratio of chunk duration.
            Defaults to (0.1, 0.1), i.e. 10% on both sides.

        Returns
        -------
        count : SlidingWindowFeature
            (num_frames, 1)-shaped instantaneous speaker count
        """

        trimmed = Inference.trim(binarized_segmentations, warm_up=warm_up)

        # per-chunk number of active classes, aggregated over overlapping chunks
        count = Inference.aggregate(
            np.sum(trimmed, axis=-1, keepdims=True),
            frames,
            hamming=False,
            missing=0.0,
            skip_average=False,
        )

        # round the aggregated values to the nearest integer count
        count.data = np.rint(count.data).astype(np.uint8)

        return count

    @staticmethod
    def to_annotation(
        discrete_diarization: SlidingWindowFeature,
        min_duration_on: float = 0.0,
        min_duration_off: float = 0.0,
    ) -> Annotation:
        """Convert discrete diarization into a continuous annotation

        Parameters
        ----------
        discrete_diarization : SlidingWindowFeature
            (num_frames, num_speakers)-shaped discrete diarization
        min_duration_on : float, optional
            Forwarded to `Binarize`. Defaults to 0.
        min_duration_off : float, optional
            Forwarded to `Binarize`. Defaults to 0.

        Returns
        -------
        continuous_diarization : Annotation
            Continuous diarization, with speaker labels as integers,
            corresponding to the speaker indices in the discrete diarization.
        """

        # 0.5 thresholds split the 0/1 values of the discrete diarization
        binarize = Binarize(
            onset=0.5,
            offset=0.5,
            min_duration_on=min_duration_on,
            min_duration_off=min_duration_off,
        )

        return binarize(discrete_diarization).rename_tracks(generator="string")

    @staticmethod
    def to_diarization(
        segmentations: SlidingWindowFeature,
        count: SlidingWindowFeature,
    ) -> SlidingWindowFeature:
        """Build diarization out of preprocessed segmentation and precomputed speaker count

        Parameters
        ----------
        segmentations : SlidingWindowFeature
            (num_chunks, num_frames, num_speakers)-shaped segmentations
        count : SlidingWindowFeature
            (num_frames, 1)-shaped speaker count

        Returns
        -------
        discrete_diarization : SlidingWindowFeature
            Discrete (0s and 1s) diarization.
        """

        activations = Inference.aggregate(
            segmentations,
            count.sliding_window,
            hamming=False,
            missing=0.0,
            skip_average=True,
        )

        # `activations.data` is two-dimensional: (num_frames, num_speakers)
        _, num_speakers = activations.data.shape

        # Cast to a plain int so that the pad width below is not computed
        # with the (possibly unsigned 8-bit) dtype of `count.data`.
        max_speakers_per_frame = int(np.max(count.data))
        if num_speakers < max_speakers_per_frame:
            # add empty speaker columns so that every frame can be assigned
            # as many speakers as `count` asks for
            activations.data = np.pad(
                activations.data, ((0, 0), (0, max_speakers_per_frame - num_speakers))
            )

        # restrict both features to their common temporal extent
        extent = activations.extent & count.extent
        activations = activations.crop(extent, return_data=False)
        count = count.crop(extent, return_data=False)

        # speaker indices sorted by decreasing activation, for each frame
        sorted_speakers = np.argsort(-activations.data, axis=-1)
        binary = np.zeros_like(activations.data)

        for t, ((_, c), speakers) in enumerate(zip(count, sorted_speakers)):
            # mark the `c` most active speakers of frame `t`
            binary[t, speakers[: c.item()]] = 1.0

        return SlidingWindowFeature(binary, activations.sliding_window)

    def classes(self):
        """Generate an endless sequence of default speaker labels

        Yields
        ------
        label : str
            "SPEAKER_00", "SPEAKER_01", "SPEAKER_02", etc.
        """
        speaker = 0
        while True:
            yield f"SPEAKER_{speaker:02d}"
            speaker += 1
|
|
|