"""AudioSet -> Whisper-model routing.

Maps the top-1 AudioSet display name produced by the MIT AST AudioSet
classifier (MIT/ast-finetuned-audioset-10-10-0.4593) to one of three
routes:

    "speech" -> laion/voice-tagging-whisper + laion/BUD-E-Whisper_V1.2
    "music"  -> laion/music-whisper
    "sfx"    -> laion/sound-effect-captioning-whisper   (default / fallback)

The class lists below are derived from the official AudioSet ontology
(https://github.com/audioset/ontology). The "speech" set covers the
`/m/09x0r Speech` subtree plus other linguistic / vocal-utterance classes
under `Human voice` (Shout, Scream, Whisper, Laughter, Crying, Sigh, Groan,
Grunt) and the `Human group actions` classes that denote crowds of people
talking (Chatter, Crowd, Hubbub, Children playing).

The "music" set is the entire `/m/04rlf Music` subtree (instruments,
genres, music-mood classes) plus the `Singing` subtree (Singing, Choir,
Yodeling, Chant, Mantra, Male/Female/Child/Synthetic singing, Rapping,
Humming) which technically also lives under `Human voice` but is musical
in nature and best handled by the music captioning model.

Everything else (animals, environmental sounds, vehicles, household
objects, explosions, motion, ambiguous mixtures, ...) is routed to the
general-purpose sound-effect captioning model.
"""

from __future__ import annotations

# ----------------------------------------------------------------------
# Speech-related AudioSet classes (route -> "speech")
# ----------------------------------------------------------------------
SPEECH_LABELS: frozenset[str] = frozenset({
    # Speech subtree (/m/09x0r)
    "Speech",
    "Male speech, man speaking",
    "Female speech, woman speaking",
    "Child speech, kid speaking",
    "Conversation",
    "Narration, monologue",
    "Babbling",
    "Speech synthesizer",

    # Shout / Yell / Scream / Whisper
    "Shout",
    "Bellow",
    "Whoop",
    "Yell",
    "Battle cry",
    "Children shouting",
    "Screaming",
    "Whispering",

    # Laughter family
    "Laughter",
    "Baby laughter",
    "Giggle",
    "Snicker",
    "Belly laugh",
    "Chuckle, chortle",

    # Crying family
    "Crying, sobbing",
    "Baby cry, infant cry",
    "Whimper",
    "Wail, moan",

    # Other vocal utterances under Human voice (non-singing)
    "Sigh",
    "Groan",
    "Grunt",

    # Human group actions dominated by speech
    "Chatter",
    "Crowd",
    "Hubbub, speech noise, speech babble",
    "Children playing",
})


# ----------------------------------------------------------------------
# Music-related AudioSet classes (route -> "music")
# ----------------------------------------------------------------------
MUSIC_LABELS: frozenset[str] = frozenset({
    # Singing subtree (lives under Human voice but is musical)
    "Singing",
    "Choir",
    "Yodeling",
    "Chant",
    "Mantra",
    "Male singing",
    "Female singing",
    "Child singing",
    "Synthetic singing",
    "Rapping",
    "Humming",

    # Music root + instrument categories (/m/04rlf)
    "Music",
    "Musical instrument",
    "Plucked string instrument",
    "Guitar",
    "Electric guitar",
    "Bass guitar",
    "Acoustic guitar",
    "Steel guitar, slide guitar",
    "Tapping (guitar technique)",
    "Strum",
    "Banjo",
    "Sitar",
    "Mandolin",
    "Zither",
    "Ukulele",
    "Keyboard (musical)",
    "Piano",
    "Electric piano",
    "Organ",
    "Electronic organ",
    "Hammond organ",
    "Synthesizer",
    "Sampler",
    "Harpsichord",
    "Percussion",
    "Drum kit",
    "Drum machine",
    "Drum",
    "Snare drum",
    "Rimshot",
    "Drum roll",
    "Bass drum",
    "Timpani",
    "Tabla",
    "Cymbal",
    "Hi-hat",
    "Wood block",
    "Tambourine",
    "Rattle (instrument)",
    "Maraca",
    "Gong",
    "Tubular bells",
    "Mallet percussion",
    "Marimba, xylophone",
    "Glockenspiel",
    "Vibraphone",
    "Steelpan",
    "Cowbell",
    "Orchestra",
    "Brass instrument",
    "French horn",
    "Trumpet",
    "Trombone",
    "Bowed string instrument",
    "String section",
    "Violin, fiddle",
    "Pizzicato",
    "Cello",
    "Double bass",
    "Wind instrument, woodwind instrument",
    "Flute",
    "Saxophone",
    "Clarinet",
    "Harp",
    "Bell",
    "Church bell",
    "Jingle bell",
    "Bicycle bell",
    "Tuning fork",
    "Chime",
    "Wind chime",
    "Change ringing (campanology)",
    "Harmonica",
    "Accordion",
    "Bagpipes",
    "Didgeridoo",
    "Shofar",
    "Theremin",
    "Singing bowl",
    "Scratching (performance technique)",

    # Genres
    "Pop music",
    "Hip hop music",
    "Beatboxing",
    "Rock music",
    "Heavy metal",
    "Punk rock",
    "Grunge",
    "Progressive rock",
    "Rock and roll",
    "Psychedelic rock",
    "Rhythm and blues",
    "Soul music",
    "Reggae",
    "Country",
    "Swing music",
    "Bluegrass",
    "Funk",
    "Folk music",
    "Middle Eastern music",
    "Jazz",
    "Disco",
    "Classical music",
    "Opera",
    "Electronic music",
    "House music",
    "Techno",
    "Dubstep",
    "Drum and bass",
    "Electronica",
    "Electronic dance music",
    "Ambient music",
    "Trance music",
    "Music of Latin America",
    "Salsa music",
    "Flamenco",
    "Blues",
    "Music for children",
    "New-age music",
    "Vocal music",
    "A capella",
    "Music of Africa",
    "Afrobeat",
    "Christian music",
    "Gospel music",
    "Music of Asia",
    "Carnatic music",
    "Music of Bollywood",
    "Ska",
    "Traditional music",
    "Independent music",

    # Music roles / moods
    "Song",
    "Background music",
    "Theme music",
    "Jingle (music)",
    "Soundtrack music",
    "Lullaby",
    "Video game music",
    "Christmas music",
    "Dance music",
    "Wedding music",
    "Happy music",
    "Funny music",
    "Sad music",
    "Tender music",
    "Exciting music",
    "Angry music",
    "Scary music",
})


# Confidence threshold applied only when the top-1 AudioSet class is the
# generic catch-all "Speech" label. Empirically AST often returns "Speech"
# as top-1 with relatively low confidence on clips that are dominated by
# a sound effect (e.g. a car horn, a microwave beep) but happen to also
# contain a faint human voice. Requiring a fairly high confidence here
# eliminates most of these false positives at the cost of occasionally
# routing a quiet / ambiguous speech clip through the sound-effect
# captioner instead. This is just a rule of thumb -- the routing is not
# perfect and could be improved with a learned classifier on top of the
# AST embeddings.
SPEECH_TOP1_MIN_CONFIDENCE: float = 0.80


def route(top_label: str, top_confidence: float | None = None) -> str:
    """Return the routing target for an AudioSet display name.

    Decision rule:
        - top label is the generic ``"Speech"`` class:
            * confidence >= SPEECH_TOP1_MIN_CONFIDENCE  -> "speech"
            * confidence  < SPEECH_TOP1_MIN_CONFIDENCE  -> "sfx"
              (handles AST false positives where a faint voice nudges
              "Speech" to top-1 but the dominant sound is something else)
        - top label is any other class in SPEECH_LABELS  -> "speech"
          (more specific labels like "Male speech, man speaking",
           "Narration, monologue", "Laughter", "Sigh" are trusted at
           any confidence)
        - top label is in MUSIC_LABELS  -> "music"
        - everything else               -> "sfx"  (general sound effects)

    The decision is based on the *top-1* prediction only. If the model
    is uncertain on something other than the generic "Speech" class
    (e.g. mixed content where the top class is something like "Inside,
    small room" or an animal sound), the file falls through to the
    general-purpose sound effect captioning model, which is the safest,
    most informative fallback.

    This routing heuristic is intentionally simple and **not perfect** --
    it works well in practice but could be improved with a small learned
    classifier on top of the AST embeddings, or with more sophisticated
    multi-class voting over the top-k predictions.
    """
    if top_label == "Speech":
        if top_confidence is not None and top_confidence < SPEECH_TOP1_MIN_CONFIDENCE:
            return "sfx"
        return "speech"
    if top_label in SPEECH_LABELS:
        return "speech"
    if top_label in MUSIC_LABELS:
        return "music"
    return "sfx"


__all__ = ["SPEECH_LABELS", "MUSIC_LABELS", "SPEECH_TOP1_MIN_CONFIDENCE", "route"]
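# The docstring above flags voting over the top-k predictions as one possible
# refinement. A hedged sketch of what that could look like, assuming
# predictions arrive as (label, score) pairs the way the transformers
# audio-classification pipeline provides them. `route_topk` is a hypothetical
# name, not part of this module, and the label sets are abbreviated stand-ins
# for SPEECH_LABELS / MUSIC_LABELS so the snippet is self-contained.

```python
from collections import defaultdict

# Abbreviated stand-ins for the module's SPEECH_LABELS / MUSIC_LABELS.
SPEECH = {"Speech", "Conversation", "Narration, monologue"}
MUSIC = {"Music", "Jazz", "Piano"}


def route_topk(preds: list[tuple[str, float]]) -> str:
    """Sum score mass per route over the top-k (label, score) pairs and
    return the route with the most mass, instead of trusting top-1 alone."""
    mass: dict[str, float] = defaultdict(float)
    for label, score in preds:
        if label in SPEECH:
            mass["speech"] += score
        elif label in MUSIC:
            mass["music"] += score
        else:
            mass["sfx"] += score
    return max(mass, key=mass.__getitem__)


# A faint voice plus dominant mechanical noise routes to "sfx",
# because the non-speech classes jointly outweigh "Speech":
print(route_topk([("Speech", 0.35), ("Vacuum cleaner", 0.40), ("Hum", 0.15)]))
```

# This sidesteps the single hard SPEECH_TOP1_MIN_CONFIDENCE threshold: a clip
# where "Speech" narrowly wins top-1 but sound-effect classes dominate the
# rest of the top-k still lands on the sound-effect captioner.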