ChristophSchuhmann committed on
Commit
0544bfc
·
verified ·
1 Parent(s): 4aed451

Upload scripts/render_results.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/render_results.py +130 -0
scripts/render_results.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Render the 18 sample annotations as a Markdown block to embed in README.md.

Reads samples/{audioset,music,majestrino}/*.json and writes the Markdown to
samples/results_block.md.
"""
from __future__ import annotations

import json
from pathlib import Path

# Repository root: this script lives in <root>/scripts/, so two .parent hops
# from the resolved script path land on the repo root.
ROOT = Path(__file__).resolve().parent.parent
# Directory that holds the per-dataset sample folders and the rendered output.
SAMPLES = ROOT / "samples"

# One entry per source dataset. Keys:
#   short: subdirectory name under samples/ holding this dataset's JSON/audio
#   title: Markdown heading text for the dataset section
#   link:  dataset page URL on the Hugging Face Hub
#   blurb: one-sentence description shown under the heading
DATASETS: list[dict[str, str]] = [
    {
        "short": "audioset",
        "title": "Source: `mitermix/audioset-with-grounded-captions`",
        "link": "https://huggingface.co/datasets/mitermix/audioset-with-grounded-captions",
        "blurb": "AudioSet-derived clips with mixed content (speech, music, "
        "sound effects) — a good test of all three routes.",
    },
    {
        "short": "music",
        "title": "Source: `laion/captioned-ai-music-snippets`",
        "link": "https://huggingface.co/datasets/laion/captioned-ai-music-snippets",
        "blurb": "AI-generated music snippets, primarily routed to the music "
        "captioner.",
    },
    {
        "short": "majestrino",
        "title": "Source: `TTS-AGI/majestrino-unified-detailed-captions-temporal`",
        "link": "https://huggingface.co/datasets/TTS-AGI/majestrino-unified-detailed-captions-temporal",
        "blurb": "High-quality TTS-style speech recordings, primarily routed "
        "to the speech models.",
    },
]
37
+
38
+
39
def render_clip(payload: dict, audio_rel: str) -> str:
    """Render a single annotated clip as a Markdown section.

    The section shows a download link, the top-3 AudioSet predictions as a
    table, the chosen route, and the caption(s)/tags produced by the models
    for that route, followed by a horizontal rule.

    Args:
        payload: One clip's annotation record, with keys ``audio_file``,
            ``audioset_top3``, ``route`` and ``annotations``.
        audio_rel: Path to the audio file, relative to the repo-root README.

    Returns:
        The Markdown section as a single newline-joined string.
    """
    lines = [
        f"#### `{payload['audio_file']}`",
        "",
        f"[Listen / download]({audio_rel})",
        "",
        "**AudioSet top-3 predictions** (MIT AST):",
        "",
        "| # | Label | Confidence |",
        "|---|---|---|",
    ]
    for rank, pred in enumerate(payload["audioset_top3"], start=1):
        confidence_pct = 100.0 * pred["confidence"]
        lines.append(f"| {rank} | `{pred['label']}` | {confidence_pct:.1f}% |")
    lines += ["", f"**Route:** `{payload['route']}`", ""]

    # Map the route to the (annotation key, section heading) pairs to emit,
    # in order; a pair is skipped when its key is absent from the record.
    route = payload["route"]
    if route == "speech":
        sections = [
            ("voice_tags", "**`laion/voice-tagging-whisper` — voice tags:**"),
            ("bud_e_speech_caption", "**`laion/BUD-E-Whisper_V1.2` — speech caption:**"),
        ]
    elif route == "music":
        sections = [
            ("music_caption", "**`laion/music-whisper` — music caption:**"),
        ]
    else:  # sfx
        sections = [
            ("sound_effect_caption", "**`laion/sound-effect-captioning-whisper` — sound caption:**"),
        ]

    ann = payload["annotations"]
    for key, heading in sections:
        if key in ann:
            lines += [heading, "", f"> {ann[key]}", ""]

    lines += ["---", ""]
    return "\n".join(lines)
85
+
86
+
87
def render_dataset(ds: dict) -> str:
    """Render one dataset's section: heading, blurb/link, then every clip.

    Args:
        ds: One entry from ``DATASETS`` (keys ``short``, ``title``, ``link``,
            ``blurb``).

    Returns:
        The dataset section as Markdown; contains ``_(no samples)_`` when the
        dataset's samples directory has no JSON files.
    """
    out: list[str] = []
    out.append(f"### {ds['title']}")
    out.append("")
    out.append(f"{ds['blurb']} \nDataset: {ds['link']}")
    out.append("")

    json_files = sorted((SAMPLES / ds["short"]).glob("*.json"))
    if not json_files:
        out.append("_(no samples)_")
        return "\n".join(out)

    for jf in json_files:
        # Read explicitly as UTF-8: captions contain non-ASCII characters
        # (e.g. em-dashes) and the platform default encoding is not
        # guaranteed to be UTF-8 (notably on Windows).
        payload = json.loads(jf.read_text(encoding="utf-8"))
        # Audio file lives next to the JSON. We expose it via a relative path
        # from the README at repo root, i.e. ./samples/<short>/<basename>
        audio_basename = payload["audio_file"]
        audio_rel = f"./samples/{ds['short']}/{audio_basename}"
        out.append(render_clip(payload, audio_rel))
    return "\n".join(out)
107
+
108
+
109
def main() -> int:
    """Assemble the full results block and write it to samples/results_block.md.

    Returns:
        0, so ``SystemExit(main())`` exits with a success status.
    """
    blocks = ["## Sample annotations\n"]
    blocks.append(
        "The pipeline below was run end-to-end on **6 random audio clips "
        "drawn from each of three Hugging Face datasets** "
        "(18 clips total). For each clip we show the top-3 AudioSet "
        "predictions from the MIT AST router, the route the clip was "
        "dispatched to, and the resulting Whisper caption / tags. The "
        "audio files themselves are mirrored in this repo under "
        "[`samples/`](./samples) so you can listen along.\n"
    )
    for ds in DATASETS:
        blocks.append(render_dataset(ds))
    md = "\n".join(blocks)
    out_path = SAMPLES / "results_block.md"
    # Write explicitly as UTF-8: the Markdown contains non-ASCII characters
    # (em-dashes) and the platform default encoding may reject them.
    out_path.write_text(md, encoding="utf-8")
    print(f"Wrote {out_path} ({len(md)} chars)")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())