Update scripts/render_results.py
Browse files — scripts/render_results.py (+30 −31)
scripts/render_results.py
CHANGED
|
@@ -10,7 +10,6 @@ when the README is rendered as the model card.
|
|
| 10 |
from __future__ import annotations
|
| 11 |
|
| 12 |
import json
|
| 13 |
-
from itertools import zip_longest
|
| 14 |
from pathlib import Path
|
| 15 |
|
| 16 |
ROOT = Path(__file__).resolve().parent.parent
|
|
@@ -50,17 +49,39 @@ DATASETS = [
|
|
| 50 |
"link": "https://huggingface.co/datasets/laion/freesound-commercially-permissive-subset-with-captions",
|
| 51 |
"blurb": "Curated commercially-permissive Freesound clips — 10 "
|
| 52 |
"examples that the AST router classified as something "
|
| 53 |
-
"other than speech or music
|
| 54 |
-
"the
|
| 55 |
-
"with_upstream": True,
|
| 56 |
},
|
| 57 |
]
|
| 58 |
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
def render_clip(item: dict) -> str:
|
| 61 |
payload = item["payload"]
|
| 62 |
audio = item["audio_url"]
|
| 63 |
-
upstream = item.get("upstream")
|
| 64 |
ds = item["dataset"]
|
| 65 |
|
| 66 |
out: list[str] = []
|
|
@@ -75,21 +96,6 @@ def render_clip(item: dict) -> str:
|
|
| 75 |
)
|
| 76 |
out.append("")
|
| 77 |
|
| 78 |
-
if upstream is not None:
|
| 79 |
-
cap = upstream.get("comprehensive_caption")
|
| 80 |
-
fid = upstream.get("freesound_id")
|
| 81 |
-
if cap:
|
| 82 |
-
out.append("**Upstream dataset caption:**")
|
| 83 |
-
out.append("")
|
| 84 |
-
if fid:
|
| 85 |
-
out.append(
|
| 86 |
-
f"> {cap} \n> _(Freesound id "
|
| 87 |
-
f"[`{fid}`](https://freesound.org/s/{fid}/))_"
|
| 88 |
-
)
|
| 89 |
-
else:
|
| 90 |
-
out.append(f"> {cap}")
|
| 91 |
-
out.append("")
|
| 92 |
-
|
| 93 |
out.append("**AudioSet top-3 predictions** (MIT AST):")
|
| 94 |
out.append("")
|
| 95 |
out.append("| # | Label | Confidence |")
|
|
@@ -106,24 +112,24 @@ def render_clip(item: dict) -> str:
|
|
| 106 |
if "voice_tags" in ann:
|
| 107 |
out.append("**`laion/voice-tagging-whisper` — voice tags:**")
|
| 108 |
out.append("")
|
| 109 |
-
out.append(
|
| 110 |
out.append("")
|
| 111 |
if "bud_e_speech_caption" in ann:
|
| 112 |
out.append("**`laion/BUD-E-Whisper_V1.2` — speech caption:**")
|
| 113 |
out.append("")
|
| 114 |
-
out.append(
|
| 115 |
out.append("")
|
| 116 |
elif payload["route"] == "music":
|
| 117 |
if "music_caption" in ann:
|
| 118 |
out.append("**`laion/music-whisper` — music caption:**")
|
| 119 |
out.append("")
|
| 120 |
-
out.append(
|
| 121 |
out.append("")
|
| 122 |
else: # sfx
|
| 123 |
if "sound_effect_caption" in ann:
|
| 124 |
out.append("**`laion/sound-effect-captioning-whisper` — sound caption:**")
|
| 125 |
out.append("")
|
| 126 |
-
out.append(
|
| 127 |
out.append("")
|
| 128 |
|
| 129 |
out.append("---")
|
|
@@ -135,12 +141,6 @@ def collect_clips() -> list[dict]:
|
|
| 135 |
"""Load every sample JSON sidecar and return a flat list of items."""
|
| 136 |
items: list[dict] = []
|
| 137 |
for ds in DATASETS:
|
| 138 |
-
upstream_map: dict[str, dict] = {}
|
| 139 |
-
if ds.get("with_upstream"):
|
| 140 |
-
up_path = SAMPLES / ds["short"] / "upstream_captions.json"
|
| 141 |
-
if up_path.exists():
|
| 142 |
-
upstream_map = json.loads(up_path.read_text())
|
| 143 |
-
|
| 144 |
json_files = [
|
| 145 |
jf for jf in sorted((SAMPLES / ds["short"]).glob("*.json"))
|
| 146 |
if jf.name != "upstream_captions.json"
|
|
@@ -153,7 +153,6 @@ def collect_clips() -> list[dict]:
|
|
| 153 |
"dataset": ds,
|
| 154 |
"payload": payload,
|
| 155 |
"audio_url": audio_url,
|
| 156 |
-
"upstream": upstream_map.get(audio_basename),
|
| 157 |
})
|
| 158 |
return items
|
| 159 |
|
|
|
|
| 10 |
from __future__ import annotations
|
| 11 |
|
| 12 |
import json
|
|
|
|
| 13 |
from pathlib import Path
|
| 14 |
|
| 15 |
ROOT = Path(__file__).resolve().parent.parent
|
|
|
|
| 49 |
"link": "https://huggingface.co/datasets/laion/freesound-commercially-permissive-subset-with-captions",
|
| 50 |
"blurb": "Curated commercially-permissive Freesound clips — 10 "
|
| 51 |
"examples that the AST router classified as something "
|
| 52 |
+
"other than speech or music, so they are routed through "
|
| 53 |
+
"the general-purpose sound-effect captioner.",
|
|
|
|
| 54 |
},
|
| 55 |
]
|
| 56 |
|
| 57 |
|
| 58 |
+
def _escape_md_emphasis(text: str) -> str:
|
| 59 |
+
"""Escape characters that would otherwise toggle Markdown emphasis.
|
| 60 |
+
|
| 61 |
+
Hugging Face renders blockquotes (`> ...`) in italic by default, so
|
| 62 |
+
a stray ``*`` or ``_`` inside a caption flips the italic off mid-text
|
| 63 |
+
and the rest of the paragraph appears in upright type. Backslash-
|
| 64 |
+
escaping these (and ``~`` for strikethrough) keeps the blockquote
|
| 65 |
+
italic from end to end.
|
| 66 |
+
"""
|
| 67 |
+
return (
|
| 68 |
+
text.replace("\\", "\\\\")
|
| 69 |
+
.replace("*", "\\*")
|
| 70 |
+
.replace("_", "\\_")
|
| 71 |
+
.replace("~", "\\~")
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _quote_caption(text: str) -> str:
|
| 76 |
+
"""Render multi-line caption text as a single blockquote."""
|
| 77 |
+
text = _escape_md_emphasis(text.strip())
|
| 78 |
+
lines = text.split("\n")
|
| 79 |
+
return "\n".join(f"> {ln}" if ln.strip() else ">" for ln in lines)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
def render_clip(item: dict) -> str:
|
| 83 |
payload = item["payload"]
|
| 84 |
audio = item["audio_url"]
|
|
|
|
| 85 |
ds = item["dataset"]
|
| 86 |
|
| 87 |
out: list[str] = []
|
|
|
|
| 96 |
)
|
| 97 |
out.append("")
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
out.append("**AudioSet top-3 predictions** (MIT AST):")
|
| 100 |
out.append("")
|
| 101 |
out.append("| # | Label | Confidence |")
|
|
|
|
| 112 |
if "voice_tags" in ann:
|
| 113 |
out.append("**`laion/voice-tagging-whisper` — voice tags:**")
|
| 114 |
out.append("")
|
| 115 |
+
out.append(_quote_caption(ann["voice_tags"]))
|
| 116 |
out.append("")
|
| 117 |
if "bud_e_speech_caption" in ann:
|
| 118 |
out.append("**`laion/BUD-E-Whisper_V1.2` — speech caption:**")
|
| 119 |
out.append("")
|
| 120 |
+
out.append(_quote_caption(ann["bud_e_speech_caption"]))
|
| 121 |
out.append("")
|
| 122 |
elif payload["route"] == "music":
|
| 123 |
if "music_caption" in ann:
|
| 124 |
out.append("**`laion/music-whisper` — music caption:**")
|
| 125 |
out.append("")
|
| 126 |
+
out.append(_quote_caption(ann["music_caption"]))
|
| 127 |
out.append("")
|
| 128 |
else: # sfx
|
| 129 |
if "sound_effect_caption" in ann:
|
| 130 |
out.append("**`laion/sound-effect-captioning-whisper` — sound caption:**")
|
| 131 |
out.append("")
|
| 132 |
+
out.append(_quote_caption(ann["sound_effect_caption"]))
|
| 133 |
out.append("")
|
| 134 |
|
| 135 |
out.append("---")
|
|
|
|
| 141 |
"""Load every sample JSON sidecar and return a flat list of items."""
|
| 142 |
items: list[dict] = []
|
| 143 |
for ds in DATASETS:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
json_files = [
|
| 145 |
jf for jf in sorted((SAMPLES / ds["short"]).glob("*.json"))
|
| 146 |
if jf.name != "upstream_captions.json"
|
|
|
|
| 153 |
"dataset": ds,
|
| 154 |
"payload": payload,
|
| 155 |
"audio_url": audio_url,
|
|
|
|
| 156 |
})
|
| 157 |
return items
|
| 158 |
|