"""AudioSet -> Whisper-model routing.
Maps the top-1 AudioSet display name produced by the MIT AST AudioSet
classifier (MIT/ast-finetuned-audioset-10-10-0.4593) to one of three
routes:
"speech" -> laion/voice-tagging-whisper + laion/BUD-E-Whisper_V1.2
"music" -> laion/music-whisper
"sfx" -> laion/sound-effect-captioning-whisper (default / fallback)
The class lists below are derived from the official AudioSet ontology
(https://github.com/audioset/ontology). The "speech" set covers the
`/m/09x0r Speech` subtree plus other linguistic / vocal-utterance classes
under `Human voice` (Shout, Scream, Whisper, Laughter, Crying, Sigh, Groan,
Grunt) and the `Human group actions` classes that denote crowds of people
talking (Chatter, Crowd, Hubbub, Children playing).
The "music" set is the entire `/m/04rlf Music` subtree (instruments,
genres, music-mood classes) plus the `Singing` subtree (Singing, Choir,
Yodeling, Chant, Mantra, Male/Female/Child/Synthetic singing, Rapping,
Humming) which technically also lives under `Human voice` but is musical
in nature and best handled by the music captioning model.
Everything else (animals, environmental sounds, vehicles, household
objects, explosions, motion, ambiguous mixtures, ...) is routed to the
general-purpose sound-effect captioning model.
"""
from __future__ import annotations
# ----------------------------------------------------------------------
# Speech-related AudioSet classes (route -> "speech")
# ----------------------------------------------------------------------
# Grouped by AudioSet ontology branch, then merged into a single
# immutable lookup set. The groups are private; only the merged
# SPEECH_LABELS set is part of the public API.

# Speech subtree (/m/09x0r)
_SPEECH_SUBTREE: tuple[str, ...] = (
    "Speech",
    "Male speech, man speaking",
    "Female speech, woman speaking",
    "Child speech, kid speaking",
    "Conversation",
    "Narration, monologue",
    "Babbling",
    "Speech synthesizer",
)

# Shout / Yell / Scream / Whisper
_SHOUT_AND_WHISPER: tuple[str, ...] = (
    "Shout",
    "Bellow",
    "Whoop",
    "Yell",
    "Battle cry",
    "Children shouting",
    "Screaming",
    "Whispering",
)

# Laughter family
_LAUGHTER: tuple[str, ...] = (
    "Laughter",
    "Baby laughter",
    "Giggle",
    "Snicker",
    "Belly laugh",
    "Chuckle, chortle",
)

# Crying family
_CRYING: tuple[str, ...] = (
    "Crying, sobbing",
    "Baby cry, infant cry",
    "Whimper",
    "Wail, moan",
)

# Other vocal utterances under Human voice (non-singing)
_OTHER_VOCAL: tuple[str, ...] = ("Sigh", "Groan", "Grunt")

# Human group actions dominated by speech
_GROUP_SPEECH: tuple[str, ...] = (
    "Chatter",
    "Crowd",
    "Hubbub, speech noise, speech babble",
    "Children playing",
)

# All AudioSet display names that route to the speech captioning models.
SPEECH_LABELS: frozenset[str] = frozenset(
    _SPEECH_SUBTREE
    + _SHOUT_AND_WHISPER
    + _LAUGHTER
    + _CRYING
    + _OTHER_VOCAL
    + _GROUP_SPEECH
)
# ----------------------------------------------------------------------
# Music-related AudioSet classes (route -> "music")
# ----------------------------------------------------------------------
# Exact AudioSet display names (ontology: /m/04rlf Music subtree plus the
# Singing subtree). Membership is tested verbatim against the classifier's
# top-1 display name, so these strings must match the ontology exactly,
# including punctuation (e.g. "Violin, fiddle", "Keyboard (musical)").
MUSIC_LABELS: frozenset[str] = frozenset({
    # Singing subtree (lives under Human voice but is musical)
    "Singing",
    "Choir",
    "Yodeling",
    "Chant",
    "Mantra",
    "Male singing",
    "Female singing",
    "Child singing",
    "Synthetic singing",
    "Rapping",
    "Humming",
    # Music root + instrument categories (/m/04rlf)
    "Music",
    "Musical instrument",
    "Plucked string instrument",
    "Guitar",
    "Electric guitar",
    "Bass guitar",
    "Acoustic guitar",
    "Steel guitar, slide guitar",
    "Tapping (guitar technique)",
    "Strum",
    "Banjo",
    "Sitar",
    "Mandolin",
    "Zither",
    "Ukulele",
    "Keyboard (musical)",
    "Piano",
    "Electric piano",
    "Organ",
    "Electronic organ",
    "Hammond organ",
    "Synthesizer",
    "Sampler",
    "Harpsichord",
    "Percussion",
    "Drum kit",
    "Drum machine",
    "Drum",
    "Snare drum",
    "Rimshot",
    "Drum roll",
    "Bass drum",
    "Timpani",
    "Tabla",
    "Cymbal",
    "Hi-hat",
    "Wood block",
    "Tambourine",
    "Rattle (instrument)",
    "Maraca",
    "Gong",
    "Tubular bells",
    "Mallet percussion",
    "Marimba, xylophone",
    "Glockenspiel",
    "Vibraphone",
    "Steelpan",
    "Cowbell",
    "Orchestra",
    "Brass instrument",
    "French horn",
    "Trumpet",
    "Trombone",
    "Bowed string instrument",
    "String section",
    "Violin, fiddle",
    "Pizzicato",
    "Cello",
    "Double bass",
    "Wind instrument, woodwind instrument",
    "Flute",
    "Saxophone",
    "Clarinet",
    "Harp",
    "Bell",
    "Church bell",
    "Jingle bell",
    "Bicycle bell",
    "Tuning fork",
    "Chime",
    "Wind chime",
    "Change ringing (campanology)",
    "Harmonica",
    "Accordion",
    "Bagpipes",
    "Didgeridoo",
    "Shofar",
    "Theremin",
    "Singing bowl",
    "Scratching (performance technique)",
    # Genres
    "Pop music",
    "Hip hop music",
    "Beatboxing",
    "Rock music",
    "Heavy metal",
    "Punk rock",
    "Grunge",
    "Progressive rock",
    "Rock and roll",
    "Psychedelic rock",
    "Rhythm and blues",
    "Soul music",
    "Reggae",
    "Country",
    "Swing music",
    "Bluegrass",
    "Funk",
    "Folk music",
    "Middle Eastern music",
    "Jazz",
    "Disco",
    "Classical music",
    "Opera",
    "Electronic music",
    "House music",
    "Techno",
    "Dubstep",
    "Drum and bass",
    "Electronica",
    "Electronic dance music",
    "Ambient music",
    "Trance music",
    "Music of Latin America",
    "Salsa music",
    "Flamenco",
    "Blues",
    "Music for children",
    "New-age music",
    "Vocal music",
    "A capella",
    "Music of Africa",
    "Afrobeat",
    "Christian music",
    "Gospel music",
    "Music of Asia",
    "Carnatic music",
    "Music of Bollywood",
    "Ska",
    "Traditional music",
    "Independent music",
    # Music roles / moods
    "Song",
    "Background music",
    "Theme music",
    "Jingle (music)",
    "Soundtrack music",
    "Lullaby",
    "Video game music",
    "Christmas music",
    "Dance music",
    "Wedding music",
    "Happy music",
    "Funny music",
    "Sad music",
    "Tender music",
    "Exciting music",
    "Angry music",
    "Scary music",
})
# Confidence threshold applied only when the top-1 AudioSet class is the
# generic catch-all "Speech" label. Empirically AST often returns "Speech"
# as top-1 with relatively low confidence on clips that are dominated by
# a sound effect (e.g. a car horn, a microwave beep) but happen to also
# contain a faint human voice. Requiring a fairly high confidence here
# eliminates most of these false positives at the cost of occasionally
# routing a quiet / ambiguous speech clip through the sound-effect
# captioner instead. This is just a rule of thumb -- the routing is not
# perfect and could be improved with a learned classifier on top of the
# AST embeddings.
# NOTE: route() only applies this gate when a confidence value is given;
# callers that pass no confidence (None) bypass it and get "speech".
SPEECH_TOP1_MIN_CONFIDENCE: float = 0.80
def route(top_label: str, top_confidence: float | None = None) -> str:
    """Return the routing target for a top-1 AudioSet display name.

    Args:
        top_label: Display name of the classifier's top-1 AudioSet class.
        top_confidence: Optional top-1 confidence in ``[0, 1]``. Only
            consulted when ``top_label`` is the generic ``"Speech"`` class.

    Returns:
        One of ``"speech"``, ``"music"`` or ``"sfx"``.

    Decision rule:
    - top label is the generic ``"Speech"`` class:
        * confidence >= SPEECH_TOP1_MIN_CONFIDENCE -> "speech"
        * confidence < SPEECH_TOP1_MIN_CONFIDENCE -> "sfx"
        (handles AST false positives where a faint voice nudges
        "Speech" to top-1 but the dominant sound is something else;
        a missing confidence is treated as trusted -> "speech")
    - top label is any other class in SPEECH_LABELS -> "speech"
        (more specific labels like "Male speech, man speaking",
        "Narration, monologue", "Laughter", "Sigh" are trusted at
        any confidence)
    - top label is in MUSIC_LABELS -> "music"
    - everything else -> "sfx" (general sound effects)

    Only the top-1 prediction is used. Uncertain or mixed content whose
    top class is neither speech nor music falls through to the
    general-purpose sound-effect captioner, the safest fallback. The
    heuristic is intentionally simple and not perfect; a small learned
    classifier over the AST embeddings, or top-k voting, could improve it.
    """
    # The generic "Speech" label alone gets a confidence gate: a known
    # (non-None) confidence below the threshold demotes it to "sfx".
    if top_label == "Speech":
        below_gate = (
            top_confidence is not None
            and top_confidence < SPEECH_TOP1_MIN_CONFIDENCE
        )
        return "sfx" if below_gate else "speech"

    # Specific speech labels are trusted unconditionally.
    if top_label in SPEECH_LABELS:
        return "speech"

    # Music subtree -> music captioner; anything else -> sound effects.
    return "music" if top_label in MUSIC_LABELS else "sfx"
# Public API re-exported by ``from <module> import *``.
__all__ = ["SPEECH_LABELS", "MUSIC_LABELS", "SPEECH_TOP1_MIN_CONFIDENCE", "route"]