File size: 8,260 Bytes
6e8db4b
0544bfc
6e8db4b
 
067d9ba
 
 
 
0544bfc
 
 
 
 
 
 
 
 
067d9ba
 
 
 
 
 
0544bfc
 
 
067d9ba
0544bfc
 
 
 
 
 
067d9ba
0544bfc
 
 
 
 
 
067d9ba
0544bfc
 
 
6e8db4b
 
 
067d9ba
6e8db4b
 
067d9ba
6c550d3
 
0544bfc
 
 
 
6c550d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
067d9ba
 
 
 
 
0544bfc
 
 
6e8db4b
067d9ba
 
 
 
 
6e8db4b
0544bfc
 
 
 
 
 
067d9ba
 
 
0544bfc
 
 
 
 
 
 
 
 
6c550d3
0544bfc
 
 
 
6c550d3
0544bfc
 
 
 
 
6c550d3
0544bfc
 
 
 
 
6c550d3
0544bfc
 
 
 
 
 
 
067d9ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0544bfc
 
 
067d9ba
 
 
 
 
 
 
0544bfc
067d9ba
 
 
 
 
 
 
 
 
0544bfc
 
067d9ba
 
 
 
 
 
 
 
 
 
 
 
0544bfc
 
 
067d9ba
0544bfc
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
"""Render the sample annotations as a Markdown block to embed in README.md.

Reads samples/{audioset,music,majestrino,freesound}/*.json and writes the
Markdown to samples/results_block.md.

Audio is embedded with HTML5 <audio> tags pointing at the model repo's
`resolve/main` endpoint on Hugging Face, so the players actually work
when the README is rendered as the model card.
"""
from __future__ import annotations

import json
from pathlib import Path
from urllib.parse import quote

# Repository root: two directory levels above this file.
ROOT = Path(__file__).resolve().parent.parent
# Directory holding the per-dataset sample sidecars; the rendered Markdown
# block is written here as well.
SAMPLES = ROOT / "samples"

# Hugging Face model repo where this README will live. The audio files
# are mirrored in the repo under samples/<short>/, and we serve them
# through the resolve/main endpoint so the <audio> players work.
HF_REPO = "laion/whisper-captioning-ensemble"
HF_RESOLVE_BASE = f"https://huggingface.co/{HF_REPO}/resolve/main"

# One entry per source dataset:
#   short -- sub-directory name under samples/ (also used in the audio URL)
#   label -- dataset id shown as the link text
#   link  -- URL of the dataset page
#   blurb -- one-sentence description shown in the "Source datasets" list
DATASETS = [
    {
        "short": "audioset",
        "label": "mitermix/audioset-with-grounded-captions",
        "link":  "https://huggingface.co/datasets/mitermix/audioset-with-grounded-captions",
        "blurb": "AudioSet-derived clips with mixed content (speech, music, "
                 "sound effects) — a good test of all three routes.",
    },
    {
        "short": "music",
        "label": "laion/captioned-ai-music-snippets",
        "link":  "https://huggingface.co/datasets/laion/captioned-ai-music-snippets",
        "blurb": "AI-generated music snippets, primarily routed to the music "
                 "captioner.",
    },
    {
        "short": "majestrino",
        "label": "TTS-AGI/majestrino-unified-detailed-captions-temporal",
        "link":  "https://huggingface.co/datasets/TTS-AGI/majestrino-unified-detailed-captions-temporal",
        "blurb": "High-quality TTS-style speech recordings, primarily routed "
                 "to the speech models.",
    },
    {
        "short": "freesound",
        "label": "laion/freesound-commercially-permissive-subset-with-captions",
        "link":  "https://huggingface.co/datasets/laion/freesound-commercially-permissive-subset-with-captions",
        "blurb": "Curated commercially-permissive Freesound clips — 10 "
                 "examples that the AST router classified as something "
                 "other than speech or music, so they are routed through "
                 "the general-purpose sound-effect captioner.",
    },
]


def _escape_md_emphasis(text: str) -> str:
    """Backslash-escape the Markdown emphasis markers in *text*.

    Captions are shown inside blockquotes (`> ...`), which Hugging Face
    renders in italic. An unescaped ``*`` or ``_`` in a caption would
    switch the italic off part-way through, and ``~`` could start a
    strikethrough. Prefixing each of them (and any literal backslash)
    with a backslash keeps the whole quote uniformly styled.
    """
    # A single translate pass: every special character maps to a
    # backslash followed by itself. Handling the backslash in the same
    # table guarantees the escapes we insert are never escaped again.
    specials = "\\*_~"
    table = {ord(ch): "\\" + ch for ch in specials}
    return text.translate(table)


def _quote_caption(text: str) -> str:
    """Turn a (possibly multi-line) caption into one Markdown blockquote.

    Surrounding whitespace is stripped and emphasis markers are escaped
    first. Every line then gets a ``> `` prefix; whitespace-only lines
    are emitted as a bare ``>`` so the whole caption stays one quote.
    """
    escaped = _escape_md_emphasis(text.strip())
    quoted: list[str] = []
    for line in escaped.split("\n"):
        if line.strip():
            quoted.append(f"> {line}")
        else:
            quoted.append(">")
    return "\n".join(quoted)


def render_clip(item: dict) -> str:
    """Render one sample clip as a Markdown section.

    Args:
        item: A record shaped like the ones built by ``collect_clips``:
            ``"payload"`` is the parsed JSON sidecar (keys read here:
            ``audio_file``, ``audioset_top3``, ``route``,
            ``annotations``), ``"audio_url"`` is the URL used for the
            ``<audio>`` player and ``"dataset"`` is the matching
            ``DATASETS`` entry (``label`` and ``link`` are read).

    Returns:
        The Markdown for the clip: heading, audio player, source
        dataset line, the AudioSet prediction table, the route, the
        caption section(s) available for that route, and a closing
        horizontal rule. The string ends with a newline.

    Raises:
        KeyError: if any of the keys listed above is missing.
    """
    payload = item["payload"]
    audio = item["audio_url"]
    ds = item["dataset"]

    out: list[str] = [
        f"#### `{payload['audio_file']}`",
        "",
        f'<audio controls preload="none" src="{audio}"></audio>',
        "",
        f"_Source dataset: [`{ds['label']}`]({ds['link']})_",
        "",
        "**AudioSet top-3 predictions** (MIT AST):",
        "",
        "| # | Label | Confidence |",
        "|---|---|---|",
    ]
    for rank, top in enumerate(payload["audioset_top3"], 1):
        # Confidence is scaled by 100 for display; presumably it is
        # stored as a 0-1 fraction in the sidecar.
        pct = 100.0 * top["confidence"]
        out.append(f"| {rank} | `{top['label']}` | {pct:.1f}% |")
    out.append("")

    route = payload["route"]
    out.append(f"**Route:** `{route}`")
    out.append("")

    # (annotation key, section heading) pairs per route. Any route other
    # than "speech" or "music" is rendered as a sound-effect clip.
    sections_by_route = {
        "speech": [
            ("voice_tags",
             "**`laion/voice-tagging-whisper` — voice tags:**"),
            ("bud_e_speech_caption",
             "**`laion/BUD-E-Whisper_V1.2` — speech caption:**"),
        ],
        "music": [
            ("music_caption",
             "**`laion/music-whisper` — music caption:**"),
        ],
    }
    sfx_sections = [
        ("sound_effect_caption",
         "**`laion/sound-effect-captioning-whisper` — sound caption:**"),
    ]

    ann = payload["annotations"]
    for key, heading in sections_by_route.get(route, sfx_sections):
        # A section is emitted only when the sidecar carries that annotation.
        if key in ann:
            out.append(heading)
            out.append("")
            out.append(_quote_caption(ann[key]))
            out.append("")

    out.append("---")
    out.append("")
    return "\n".join(out)


def collect_clips() -> list[dict]:
    """Load every sample JSON sidecar and return a flat list of items.

    For each entry of ``DATASETS`` the ``*.json`` files directly under
    ``SAMPLES/<short>/`` are read in sorted order, skipping
    ``upstream_captions.json``.

    Returns:
        One dict per sidecar with the keys ``"dataset"`` (the
        ``DATASETS`` entry), ``"payload"`` (the parsed JSON) and
        ``"audio_url"`` (the ``resolve/main`` URL of the clip named by
        the payload's ``audio_file``).

    Raises:
        KeyError: if a sidecar has no ``audio_file`` key.
        json.JSONDecodeError: if a sidecar is not valid JSON.
    """
    items: list[dict] = []
    for ds in DATASETS:
        subdir = ds["short"]
        for jf in sorted((SAMPLES / subdir).glob("*.json")):
            if jf.name == "upstream_captions.json":
                continue
            # JSON is UTF-8 by specification; do not depend on the
            # platform's default text encoding.
            payload = json.loads(jf.read_text(encoding="utf-8"))
            # Percent-encode the file name so spaces, '#', '?', quotes
            # and non-ASCII characters cannot break the URL or the HTML
            # attribute it is later embedded in.
            audio_basename = quote(payload["audio_file"])
            audio_url = f"{HF_RESOLVE_BASE}/samples/{subdir}/{audio_basename}"
            items.append({
                "dataset":  ds,
                "payload":  payload,
                "audio_url": audio_url,
            })
    return items


def interleave_by_route(items: list[dict]) -> list[dict]:
    """Interleave items so they alternate speech -> sfx -> music -> ...

    Items are grouped by ``item["payload"]["route"]``. Inside each group
    the original order is preserved. Each round takes the next item from
    every group that still has one, so once a group runs out the
    remaining groups keep cycling until everything is consumed.

    Items whose route is not one of the three known ones are not
    dropped: each unknown route forms its own group, cycled after
    ``music`` in the order the routes were first seen.

    Args:
        items: Clip records; neither the list nor its elements are
            modified.

    Returns:
        A new list containing every input item exactly once.
    """
    # Seed the known routes first so they lead every round; dicts keep
    # insertion order, so unknown routes follow in first-seen order.
    buckets: dict[str, list[dict]] = {"speech": [], "sfx": [], "music": []}
    for it in items:
        buckets.setdefault(it["payload"]["route"], []).append(it)

    rounds = max((len(group) for group in buckets.values()), default=0)
    out: list[dict] = []
    for i in range(rounds):
        # Round i contributes the i-th item of each group long enough.
        for group in buckets.values():
            if i < len(group):
                out.append(group[i])
    return out


def main() -> int:
    """Build the sample-annotations Markdown and write it to disk.

    Collects every clip, writes an introduction (clip total, per-route
    counts and the list of source datasets), then one rendered section
    per clip in interleaved route order. The result is saved to
    ``SAMPLES / "results_block.md"`` and a one-line summary is printed.

    Returns:
        ``0``, suitable for use as the process exit status.
    """
    items = collect_clips()

    # Tally the three known routes in one pass.
    route_counts = {"speech": 0, "sfx": 0, "music": 0}
    for it in items:
        route = it["payload"]["route"]
        if route in route_counts:
            route_counts[route] += 1
    n_speech = route_counts["speech"]
    n_sfx = route_counts["sfx"]
    n_music = route_counts["music"]
    total = len(items)

    blocks: list[str] = ["## Sample annotations\n"]
    blocks.append(
        f"The pipeline below was run end-to-end on **{total} audio clips "
        "drawn from four Hugging Face datasets** "
        f"(routing breakdown: {n_speech} speech, {n_sfx} sfx, {n_music} music). "
        "For each clip we show the top-3 AudioSet predictions from the "
        "MIT AST router, the route the clip was dispatched to, and the "
        "resulting Whisper caption / tags. The audio files themselves are "
        "mirrored in this repo under [`samples/`](./samples) and embedded "
        "inline below — press play to listen.\n\n"
        "**Source datasets:**\n"
    )
    for ds in DATASETS:
        blocks.append(f"* [`{ds['label']}`]({ds['link']}) — {ds['blurb']}")
    blocks.append("")
    blocks.append(
        "The clips are interleaved in a **speech → sfx → music** cycle "
        "to make it easy to compare the three routing branches side by "
        "side.\n"
    )

    ordered = interleave_by_route(items)
    for it in ordered:
        blocks.append(render_clip(it))

    md = "\n".join(blocks)
    out_path = SAMPLES / "results_block.md"
    # The text contains non-ASCII characters (em dash, arrows, captions),
    # so write UTF-8 explicitly instead of the platform default encoding.
    out_path.write_text(md, encoding="utf-8")
    print(f"Wrote {out_path}  ({len(md)} chars, {len(ordered)} clips)")
    return 0


# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())