| """AudioSet -> Whisper-model routing. |
| |
| Maps the top-1 AudioSet display name produced by the MIT AST AudioSet |
| classifier (MIT/ast-finetuned-audioset-10-10-0.4593) to one of three |
| routes: |
| |
| "speech" -> laion/voice-tagging-whisper + laion/BUD-E-Whisper_V1.2 |
| "music" -> laion/music-whisper |
| "sfx" -> laion/sound-effect-captioning-whisper (default / fallback) |
| |
| The class lists below are derived from the official AudioSet ontology |
| (https://github.com/audioset/ontology). The "speech" set covers the |
| `/m/09x0r Speech` subtree plus other linguistic / vocal-utterance classes |
| under `Human voice` (Shout, Scream, Whisper, Laughter, Crying, Sigh, Groan, |
| Grunt) and the `Human group actions` classes that denote crowds of people |
| talking (Chatter, Crowd, Hubbub, Children playing). |
| |
| The "music" set is the entire `/m/04rlf Music` subtree (instruments, |
| genres, music-mood classes) plus the `Singing` subtree (Singing, Choir, |
| Yodeling, Chant, Mantra, Male/Female/Child/Synthetic singing, Rapping, |
| Humming) which technically also lives under `Human voice` but is musical |
| in nature and best handled by the music captioning model. |
| |
| Everything else (animals, environmental sounds, vehicles, household |
| objects, explosions, motion, ambiguous mixtures, ...) is routed to the |
| general-purpose sound-effect captioning model. |
| """ |
|
|
| from __future__ import annotations |
|
|
| |
| |
| |
# AudioSet display names routed to the speech-captioning models.
# Covers the `/m/09x0r Speech` subtree plus non-musical vocal utterances
# under `Human voice`, and `Human group actions` classes that denote
# crowds of people talking (see module docstring).
SPEECH_LABELS: frozenset[str] = frozenset({
    # `/m/09x0r Speech` subtree: linguistic speech proper.
    "Speech",
    "Male speech, man speaking",
    "Female speech, woman speaking",
    "Child speech, kid speaking",
    "Conversation",
    "Narration, monologue",
    "Babbling",
    "Speech synthesizer",

    # Shout / Scream / Whisper classes under `Human voice`:
    # loud or quiet vocal utterances, still linguistic in nature.
    "Shout",
    "Bellow",
    "Whoop",
    "Yell",
    "Battle cry",
    "Children shouting",
    "Screaming",
    "Whispering",

    # Laughter subtree.
    "Laughter",
    "Baby laughter",
    "Giggle",
    "Snicker",
    "Belly laugh",
    "Chuckle, chortle",

    # Crying subtree.
    "Crying, sobbing",
    "Baby cry, infant cry",
    "Whimper",
    "Wail, moan",

    # Other single vocal utterances under `Human voice`.
    "Sigh",
    "Groan",
    "Grunt",

    # `Human group actions` classes denoting many people talking.
    "Chatter",
    "Crowd",
    "Hubbub, speech noise, speech babble",
    "Children playing",
})
|
|
|
|
| |
| |
| |
# AudioSet display names routed to the music-captioning model.
# The entire `/m/04rlf Music` subtree plus the `Singing` subtree (which
# lives under `Human voice` in the ontology but is musical in nature --
# see module docstring).
MUSIC_LABELS: frozenset[str] = frozenset({
    # `Singing` subtree: musical vocalization, best handled by the
    # music model rather than the speech models.
    "Singing",
    "Choir",
    "Yodeling",
    "Chant",
    "Mantra",
    "Male singing",
    "Female singing",
    "Child singing",
    "Synthetic singing",
    "Rapping",
    "Humming",

    # `Music` root and `Musical instrument` subtree.
    "Music",
    "Musical instrument",
    "Plucked string instrument",
    "Guitar",
    "Electric guitar",
    "Bass guitar",
    "Acoustic guitar",
    "Steel guitar, slide guitar",
    "Tapping (guitar technique)",
    "Strum",
    "Banjo",
    "Sitar",
    "Mandolin",
    "Zither",
    "Ukulele",
    "Keyboard (musical)",
    "Piano",
    "Electric piano",
    "Organ",
    "Electronic organ",
    "Hammond organ",
    "Synthesizer",
    "Sampler",
    "Harpsichord",
    "Percussion",
    "Drum kit",
    "Drum machine",
    "Drum",
    "Snare drum",
    "Rimshot",
    "Drum roll",
    "Bass drum",
    "Timpani",
    "Tabla",
    "Cymbal",
    "Hi-hat",
    "Wood block",
    "Tambourine",
    "Rattle (instrument)",
    "Maraca",
    "Gong",
    "Tubular bells",
    "Mallet percussion",
    "Marimba, xylophone",
    "Glockenspiel",
    "Vibraphone",
    "Steelpan",
    "Cowbell",
    "Orchestra",
    "Brass instrument",
    "French horn",
    "Trumpet",
    "Trombone",
    "Bowed string instrument",
    "String section",
    "Violin, fiddle",
    "Pizzicato",
    "Cello",
    "Double bass",
    "Wind instrument, woodwind instrument",
    "Flute",
    "Saxophone",
    "Clarinet",
    "Harp",
    "Bell",
    "Church bell",
    "Jingle bell",
    "Bicycle bell",
    "Tuning fork",
    "Chime",
    "Wind chime",
    "Change ringing (campanology)",
    "Harmonica",
    "Accordion",
    "Bagpipes",
    "Didgeridoo",
    "Shofar",
    "Theremin",
    "Singing bowl",
    "Scratching (performance technique)",

    # `Music genre` subtree.
    "Pop music",
    "Hip hop music",
    "Beatboxing",
    "Rock music",
    "Heavy metal",
    "Punk rock",
    "Grunge",
    "Progressive rock",
    "Rock and roll",
    "Psychedelic rock",
    "Rhythm and blues",
    "Soul music",
    "Reggae",
    "Country",
    "Swing music",
    "Bluegrass",
    "Funk",
    "Folk music",
    "Middle Eastern music",
    "Jazz",
    "Disco",
    "Classical music",
    "Opera",
    "Electronic music",
    "House music",
    "Techno",
    "Dubstep",
    "Drum and bass",
    "Electronica",
    "Electronic dance music",
    "Ambient music",
    "Trance music",
    "Music of Latin America",
    "Salsa music",
    "Flamenco",
    "Blues",
    "Music for children",
    "New-age music",
    "Vocal music",
    "A capella",
    "Music of Africa",
    "Afrobeat",
    "Christian music",
    "Gospel music",
    "Music of Asia",
    "Carnatic music",
    "Music of Bollywood",
    "Ska",
    "Traditional music",
    "Independent music",

    # `Musical concepts` / music-role / music-mood classes.
    "Song",
    "Background music",
    "Theme music",
    "Jingle (music)",
    "Soundtrack music",
    "Lullaby",
    "Video game music",
    "Christmas music",
    "Dance music",
    "Wedding music",
    "Happy music",
    "Funny music",
    "Sad music",
    "Tender music",
    "Exciting music",
    "Angry music",
    "Scary music",
})
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| SPEECH_TOP1_MIN_CONFIDENCE: float = 0.80 |
|
|
|
|
def route(top_label: str, top_confidence: float | None = None) -> str:
    """Map an AudioSet top-1 display name to a captioning route.

    Returns one of three route names:

    - ``"speech"`` -- the label is in :data:`SPEECH_LABELS`. The generic
      ``"Speech"`` class is a special case: when a confidence is given
      and it falls below :data:`SPEECH_TOP1_MIN_CONFIDENCE`, the result
      is ``"sfx"`` instead (AST sometimes ranks "Speech" first when a
      faint voice rides on top of a dominant non-speech sound). More
      specific labels ("Male speech, man speaking", "Laughter", "Sigh",
      ...) are accepted regardless of confidence, as is ``"Speech"``
      when ``top_confidence`` is ``None``.
    - ``"music"`` -- the label is in :data:`MUSIC_LABELS`.
    - ``"sfx"`` -- anything else; the general sound-effect captioner is
      the safest, most informative fallback for animals, environments,
      vehicles, and ambiguous mixtures.

    Only the top-1 prediction is consulted. The heuristic is deliberately
    simple and imperfect; a small learned classifier over AST embeddings
    or top-k voting would be the natural upgrade path.
    """
    if top_label == "Speech":
        # Generic "Speech" is trusted unless an explicit confidence is
        # supplied and it sits under the floor.
        confident = (
            top_confidence is None
            or top_confidence >= SPEECH_TOP1_MIN_CONFIDENCE
        )
        return "speech" if confident else "sfx"
    if top_label in MUSIC_LABELS:
        return "music"
    return "speech" if top_label in SPEECH_LABELS else "sfx"
|
|
|
|
| __all__ = ["SPEECH_LABELS", "MUSIC_LABELS", "SPEECH_TOP1_MIN_CONFIDENCE", "route"] |
|
|