| """Render the sample annotations as a Markdown block to embed in README.md. |
| |
| Reads samples/{audioset,music,majestrino,freesound}/*.json and writes the |
| Markdown to samples/results_block.md. |
| |
| Audio is embedded with HTML5 <audio> tags pointing at the model repo's |
| `resolve/main` endpoint on Hugging Face, so the players actually work |
| when the README is rendered as the model card. |
| """ |
| from __future__ import annotations |
|
|
| import json |
| from pathlib import Path |
|
|
# Project root is the parent of the directory that contains this script;
# the sample sidecars and the generated Markdown live under ROOT/samples.
_SCRIPT_DIR = Path(__file__).resolve().parent
ROOT = _SCRIPT_DIR.parent
SAMPLES = ROOT.joinpath("samples")

# Hugging Face model repo whose `resolve/main` endpoint serves the raw
# audio files referenced by the embedded <audio> players.
HF_REPO = "laion/whisper-captioning-ensemble"
HF_RESOLVE_BASE = "https://huggingface.co/" + HF_REPO + "/resolve/main"
|
|
# One entry per source dataset, in the order they are listed in the output.
#   short - name of the per-dataset directory under samples/
#   label - Hugging Face dataset id shown to the reader
#   link  - URL of the dataset page (always the dataset id under /datasets/)
#   blurb - one-sentence description used in the "Source datasets" list
DATASETS = [
    {
        "short": short,
        "label": label,
        "link": f"https://huggingface.co/datasets/{label}",
        "blurb": blurb,
    }
    for short, label, blurb in (
        (
            "audioset",
            "mitermix/audioset-with-grounded-captions",
            "AudioSet-derived clips with mixed content (speech, music, "
            "sound effects) — a good test of all three routes.",
        ),
        (
            "music",
            "laion/captioned-ai-music-snippets",
            "AI-generated music snippets, primarily routed to the music "
            "captioner.",
        ),
        (
            "majestrino",
            "TTS-AGI/majestrino-unified-detailed-captions-temporal",
            "High-quality TTS-style speech recordings, primarily routed "
            "to the speech models.",
        ),
        (
            "freesound",
            "laion/freesound-commercially-permissive-subset-with-captions",
            "Curated commercially-permissive Freesound clips — 10 "
            "examples that the AST router classified as something "
            "other than speech or music, so they are routed through "
            "the general-purpose sound-effect captioner.",
        ),
    )
]
|
|
|
|
| def _escape_md_emphasis(text: str) -> str: |
| """Escape characters that would otherwise toggle Markdown emphasis. |
| |
| Hugging Face renders blockquotes (`> ...`) in italic by default, so |
| a stray ``*`` or ``_`` inside a caption flips the italic off mid-text |
| and the rest of the paragraph appears in upright type. Backslash- |
| escaping these (and ``~`` for strikethrough) keeps the blockquote |
| italic from end to end. |
| """ |
| return ( |
| text.replace("\\", "\\\\") |
| .replace("*", "\\*") |
| .replace("_", "\\_") |
| .replace("~", "\\~") |
| ) |
|
|
|
|
def _quote_caption(text: str) -> str:
    """Format caption text as one contiguous Markdown blockquote.

    Surrounding whitespace is stripped and emphasis characters are
    escaped first. Every resulting line is then prefixed with ``> ``;
    lines that are empty or whitespace-only become a bare ``>`` so the
    blockquote is not broken up by blank lines.
    """
    escaped = _escape_md_emphasis(text.strip())
    quoted: list[str] = []
    for line in escaped.split("\n"):
        quoted.append(("> " + line) if line.strip() else ">")
    return "\n".join(quoted)
|
|
|
|
def render_clip(item: dict) -> str:
    """Render one clip as a Markdown section.

    ``item`` is a dict with three keys: ``"payload"`` (the parsed JSON
    sidecar), ``"audio_url"`` (URL placed in the ``<audio>`` tag) and
    ``"dataset"`` (a dict providing ``"label"`` and ``"link"``).

    The section consists of a heading with the audio file name, the
    audio player, the source-dataset line, the AudioSet top-3 table, the
    chosen route, the caption(s) belonging to that route, and a closing
    horizontal rule. The returned string ends with a newline.
    """
    payload = item["payload"]
    audio_url = item["audio_url"]
    source = item["dataset"]

    # Heading, player, provenance and the fixed part of the table.
    md: list[str] = [
        f"#### `{payload['audio_file']}`",
        "",
        f'<audio controls preload="none" src="{audio_url}"></audio>',
        "",
        f"_Source dataset: [`{source['label']}`]({source['link']})_",
        "",
        "**AudioSet top-3 predictions** (MIT AST):",
        "",
        "| # | Label | Confidence |",
        "|---|---|---|",
    ]
    for rank, prediction in enumerate(payload["audioset_top3"], start=1):
        percent = 100.0 * prediction["confidence"]
        md.append(f"| {rank} | `{prediction['label']}` | {percent:.1f}% |")

    route = payload["route"]
    md += ["", f"**Route:** `{route}`", ""]

    # Which annotation keys to show, and under which heading, per route.
    # Any route other than "speech" or "music" is treated as a sound effect.
    annotations = payload["annotations"]
    if route == "speech":
        sections = [
            ("voice_tags", "**`laion/voice-tagging-whisper` — voice tags:**"),
            (
                "bud_e_speech_caption",
                "**`laion/BUD-E-Whisper_V1.2` — speech caption:**",
            ),
        ]
    elif route == "music":
        sections = [
            ("music_caption", "**`laion/music-whisper` — music caption:**"),
        ]
    else:
        sections = [
            (
                "sound_effect_caption",
                "**`laion/sound-effect-captioning-whisper` — sound caption:**",
            ),
        ]

    # Annotations missing from the sidecar are skipped silently.
    for key, heading in sections:
        if key in annotations:
            md += [heading, "", _quote_caption(annotations[key]), ""]

    md += ["---", ""]
    return "\n".join(md)
|
|
|
|
def collect_clips() -> list[dict]:
    """Load every sample JSON sidecar and return a flat list of items.

    For each entry of ``DATASETS`` the ``*.json`` files in
    ``SAMPLES/<short>/`` are read in sorted order; a file named
    ``upstream_captions.json`` is skipped. Each returned item is a dict
    with:

    * ``"dataset"``   - the ``DATASETS`` entry the clip belongs to,
    * ``"payload"``   - the parsed JSON sidecar,
    * ``"audio_url"`` - ``HF_RESOLVE_BASE`` URL of the audio file named by
      the sidecar's ``"audio_file"`` field.

    Raises:
        KeyError: if a sidecar has no ``"audio_file"`` field.
        json.JSONDecodeError: if a sidecar is not valid JSON.
    """
    # Function-local import keeps this block self-contained.
    from urllib.parse import quote

    items: list[dict] = []
    for ds in DATASETS:
        sample_dir = SAMPLES / ds["short"]
        for jf in sorted(sample_dir.glob("*.json")):
            if jf.name == "upstream_captions.json":
                continue
            # JSON is UTF-8; do not depend on the platform's locale
            # encoding, which would mis-decode non-ASCII captions.
            payload = json.loads(jf.read_text(encoding="utf-8"))
            audio_basename = payload["audio_file"]
            # Percent-encode the file name so spaces, '#', '?', quotes or
            # non-ASCII characters cannot break the URL or the HTML
            # attribute it is embedded in. Safe names are unchanged.
            audio_url = (
                f"{HF_RESOLVE_BASE}/samples/{ds['short']}/"
                f"{quote(audio_basename)}"
            )
            items.append({
                "dataset": ds,
                "payload": payload,
                "audio_url": audio_url,
            })
    return items
|
|
|
|
def interleave_by_route(items: list[dict]) -> list[dict]:
    """Interleave items so they alternate speech -> sfx -> music -> ...

    Items are grouped by ``item["payload"]["route"]``. Inside each group
    the original order is preserved. Each round takes the next item from
    every group that still has one, so once a group runs out the
    remaining groups keep cycling in the same order until everything is
    consumed.

    Items whose route is not one of the three known ones are not dropped:
    their groups take part in the cycle after ``music``, in the order the
    routes were first seen. The input list is not modified.
    """
    # Seeding the dict fixes the speech -> sfx -> music order; dicts keep
    # insertion order, so unexpected routes are appended after those.
    buckets: dict[str, list[dict]] = {"speech": [], "sfx": [], "music": []}
    for it in items:
        buckets.setdefault(it["payload"]["route"], []).append(it)

    # Round ``i`` emits the i-th item of every bucket that is long enough.
    # Indexing avoids repeated ``pop(0)`` calls on the front of a list.
    rounds = max((len(bucket) for bucket in buckets.values()), default=0)
    out: list[dict] = []
    for i in range(rounds):
        for bucket in buckets.values():
            if i < len(bucket):
                out.append(bucket[i])
    return out
|
|
|
|
def main() -> int:
    """Build the sample-annotations Markdown block and write it to disk.

    Collects all clips, writes an introduction (total clip count, routing
    breakdown and the list of source datasets), then appends one rendered
    section per clip in speech -> sfx -> music interleaved order. The
    result is written to ``SAMPLES / "results_block.md"`` and a one-line
    summary is printed.

    Returns:
        ``0``, intended as the process exit code.
    """
    items = collect_clips()
    routes = [it["payload"]["route"] for it in items]
    n_speech = routes.count("speech")
    n_sfx = routes.count("sfx")
    n_music = routes.count("music")
    total = len(items)

    blocks: list[str] = ["## Sample annotations\n"]
    blocks.append(
        f"The pipeline below was run end-to-end on **{total} audio clips "
        "drawn from four Hugging Face datasets** "
        f"(routing breakdown: {n_speech} speech, {n_sfx} sfx, {n_music} music). "
        "For each clip we show the top-3 AudioSet predictions from the "
        "MIT AST router, the route the clip was dispatched to, and the "
        "resulting Whisper caption / tags. The audio files themselves are "
        "mirrored in this repo under [`samples/`](./samples) and embedded "
        "inline below — press play to listen.\n\n"
        "**Source datasets:**\n"
    )
    blocks.extend(
        f"* [`{ds['label']}`]({ds['link']}) — {ds['blurb']}" for ds in DATASETS
    )
    blocks.append("")
    blocks.append(
        "The clips are interleaved in a **speech → sfx → music** cycle "
        "to make it easy to compare the three routing branches side by "
        "side.\n"
    )

    ordered = interleave_by_route(items)
    blocks.extend(render_clip(it) for it in ordered)

    md = "\n".join(blocks)
    out_path = SAMPLES / "results_block.md"
    # The text contains non-ASCII characters (e.g. "—" and "→"), so the
    # encoding is fixed to UTF-8 rather than left to the platform locale,
    # which may be unable to encode them.
    out_path.write_text(md, encoding="utf-8")
    print(f"Wrote {out_path} ({len(md)} chars, {len(ordered)} clips)")
    return 0
|
|
|
|
if __name__ == "__main__":
    # Run the generator and hand its return value to the interpreter as
    # the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
|