File size: 8,782 Bytes
8c838e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 |
# MIT License
#
# Copyright (c) 2022- CNRS
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from typing import Dict, Mapping, Optional, Tuple, Union
import numpy as np
from pyannote_audio_utils.core import Annotation, SlidingWindow, SlidingWindowFeature
from pyannote_audio_utils.core.utils.types import Label
from pyannote_audio_utils.metrics.diarization import DiarizationErrorRate
from pyannote_audio_utils.audio.core.inference import Inference
from pyannote_audio_utils.audio.utils.signal import Binarize
# TODO: move to dedicated module
class SpeakerDiarizationMixin:
    """Defines a bunch of methods common to speaker diarization pipelines"""

    @staticmethod
    def set_num_speakers(
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
    ) -> Tuple[Optional[int], int, Union[int, float]]:
        """Validate number of speakers

        Parameters
        ----------
        num_speakers : int, optional
            Number of speakers. When provided, it overrides both
            `min_speakers` and `max_speakers`.
        min_speakers : int, optional
            Minimum number of speakers. Defaults to 1.
        max_speakers : int, optional
            Maximum number of speakers. Defaults to np.inf.

        Returns
        -------
        num_speakers : int or None
            Set to `min_speakers` when lower and upper bounds are equal,
            otherwise returned as provided.
        min_speakers : int
        max_speakers : int or np.inf

        Raises
        ------
        ValueError
            If `min_speakers` is greater than `max_speakers`.

        Notes
        -----
        Values are combined with `or`, so a falsy value (None or 0) is
        treated as "not provided" and falls through to the next candidate.
        """

        # override {min|max}_num_speakers by num_speakers when available
        min_speakers = num_speakers or min_speakers or 1
        max_speakers = num_speakers or max_speakers or np.inf

        if min_speakers > max_speakers:
            raise ValueError(
                f"min_speakers must be smaller than (or equal to) max_speakers "
                f"(here: min_speakers={min_speakers:g} and max_speakers={max_speakers:g})."
            )

        # identical bounds pin the exact number of speakers
        if min_speakers == max_speakers:
            num_speakers = min_speakers

        return num_speakers, min_speakers, max_speakers

    @staticmethod
    def optimal_mapping(
        reference: Union[Mapping, Annotation],
        hypothesis: Annotation,
        return_mapping: bool = False,
    ) -> Union[Annotation, Tuple[Annotation, Dict[Label, Label]]]:
        """Find the optimal bijective mapping between reference and hypothesis labels

        Parameters
        ----------
        reference : Annotation or Mapping
            Reference annotation. Can be an Annotation instance or
            a mapping with an "annotation" key. When a mapping also has an
            "annotated" key, its value is passed as `uem` to the underlying
            `DiarizationErrorRate.optimal_mapping` call.
        hypothesis : Annotation
            Hypothesized annotation.
        return_mapping : bool, optional
            Return the label mapping itself along with the mapped annotation. Defaults to False.

        Returns
        -------
        mapped : Annotation
            Hypothesis mapped to reference speakers.
        mapping : dict, optional
            Mapping between hypothesis (key) and reference (value) labels
            Only returned if `return_mapping` is True.
        """

        if isinstance(reference, Mapping):
            # read "annotated" from the mapping *before* rebinding `reference`
            # to the annotation itself, otherwise the membership test below
            # would be run against the annotation and never find the key
            annotated = reference["annotated"] if "annotated" in reference else None
            reference = reference["annotation"]
        else:
            annotated = None

        mapping = DiarizationErrorRate().optimal_mapping(
            reference, hypothesis, uem=annotated
        )
        mapped_hypothesis = hypothesis.rename_labels(mapping=mapping)

        if return_mapping:
            return mapped_hypothesis, mapping
        return mapped_hypothesis

    # TODO: get rid of warm-up parameter (trimming should be applied before calling speaker_count)
    @staticmethod
    def speaker_count(
        binarized_segmentations: SlidingWindowFeature,
        frames: SlidingWindow,
        warm_up: Tuple[float, float] = (0.1, 0.1),
    ) -> SlidingWindowFeature:
        """Estimate frame-level number of instantaneous speakers

        Parameters
        ----------
        binarized_segmentations : SlidingWindowFeature
            (num_chunks, num_frames, num_classes)-shaped binarized scores.
        frames : SlidingWindow
            Frames resolution, forwarded to `Inference.aggregate`.
        warm_up : (float, float) tuple, optional
            Left/right warm up ratio of chunk duration.
            Defaults to (0.1, 0.1), i.e. 10% on both sides.

        Returns
        -------
        count : SlidingWindowFeature
            (num_frames, 1)-shaped instantaneous speaker count,
            rounded to the nearest integer and stored as np.uint8.
        """

        trimmed = Inference.trim(binarized_segmentations, warm_up=warm_up)

        # per-chunk, per-frame number of active classes, aggregated over chunks
        count = Inference.aggregate(
            np.sum(trimmed, axis=-1, keepdims=True),
            frames,
            hamming=False,
            missing=0.0,
            skip_average=False,
        )
        count.data = np.rint(count.data).astype(np.uint8)

        return count

    @staticmethod
    def to_annotation(
        discrete_diarization: SlidingWindowFeature,
        min_duration_on: float = 0.0,
        min_duration_off: float = 0.0,
    ) -> Annotation:
        """Convert discrete diarization into a continuous annotation

        Parameters
        ----------
        discrete_diarization : SlidingWindowFeature
            (num_frames, num_speakers)-shaped discrete diarization
        min_duration_on : float, optional
            Forwarded to `Binarize`. Defaults to 0.
        min_duration_off : float, optional
            Forwarded to `Binarize`. Defaults to 0.

        Returns
        -------
        continuous_diarization : Annotation
            Continuous diarization, with speaker labels as integers,
            corresponding to the speaker indices in the discrete diarization.
        """

        # input is already made of 0s and 1s: threshold half-way on both sides
        binarize = Binarize(
            onset=0.5,
            offset=0.5,
            min_duration_on=min_duration_on,
            min_duration_off=min_duration_off,
        )

        return binarize(discrete_diarization).rename_tracks(generator="string")

    @staticmethod
    def to_diarization(
        segmentations: SlidingWindowFeature,
        count: SlidingWindowFeature,
    ) -> SlidingWindowFeature:
        """Build diarization out of preprocessed segmentation and precomputed speaker count

        For each frame, the `c` speakers with the highest aggregated
        activation are marked as active, where `c` is the speaker count
        of that frame.

        Parameters
        ----------
        segmentations : SlidingWindowFeature
            (num_chunks, num_frames, num_speakers)-shaped segmentations
        count : SlidingWindowFeature
            (num_frames, 1)-shaped speaker count

        Returns
        -------
        discrete_diarization : SlidingWindowFeature
            Discrete (0s and 1s) diarization.
        """

        # TODO: investigate alternative aggregation
        activations = Inference.aggregate(
            segmentations,
            count.sliding_window,
            hamming=False,
            missing=0.0,
            skip_average=True,
        )
        # shape is (num_frames, num_speakers)

        _, num_speakers = activations.data.shape

        # plain int so that the pad width below is not computed
        # in the (possibly small unsigned) dtype of the count array
        max_speakers_per_frame = int(np.max(count.data))

        # make sure there are at least as many speaker columns as the largest
        # per-frame count, so that the top-`c` selection below never runs
        # out of columns
        if num_speakers < max_speakers_per_frame:
            activations.data = np.pad(
                activations.data, ((0, 0), (0, max_speakers_per_frame - num_speakers))
            )

        # restrict both features to their common temporal extent
        extent = activations.extent & count.extent
        activations = activations.crop(extent, return_data=False)
        count = count.crop(extent, return_data=False)

        # speaker indices sorted by decreasing activation, for each frame
        sorted_speakers = np.argsort(-activations, axis=-1)
        binary = np.zeros_like(activations.data)

        for t, ((_, c), speakers) in enumerate(zip(count, sorted_speakers)):
            for i in range(c.item()):
                binary[t, speakers[i]] = 1.0

        return SlidingWindowFeature(binary, activations.sliding_window)

    def classes(self):
        """Yield an endless sequence of speaker names

        Yields
        ------
        name : str
            "SPEAKER_00", "SPEAKER_01", "SPEAKER_02", and so on
            (zero-padded to at least two digits).
        """
        speaker = 0
        while True:
            yield f"SPEAKER_{speaker:02d}"
            speaker += 1
|