Spaces:
Running
Running
File size: 9,320 Bytes
9188dd0 982628c 9188dd0 982628c 9188dd0 0c163b8 982628c 9188dd0 982628c 0c163b8 982628c 9188dd0 982628c 9188dd0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 |
from __future__ import annotations

import hashlib
import json
import os
import re
import tempfile
from pathlib import Path
from urllib.parse import parse_qs, urlparse

import gradio as gr

try:
    from yt_dlp import YoutubeDL
except ImportError:  # pragma: no cover - yt-dlp is in requirements, but guard for clarity
    YoutubeDL = None  # type: ignore[assignment]

from layout import cell
DEFAULT_VIDEO_URL = "https://www.youtube.com/watch?v=Dvjg8R0jUAk"
SEARCH_TERM = "Notstaatsvertrag"
CORRECT_TERM = "NOOTS-Staatsvertrag"
SEARCH_LANGUAGES = ["de"]
HERE = Path(__file__).parent
ASSETS_DIR = HERE / "assets"
DIGITALGIPFEL_IMG = ASSETS_DIR / "digitalgipfel.jpeg"
BASE_CACHE = Path(os.environ.get("AILEEN3_CACHE_DIR", Path.home() / ".cache" / "aileen3"))
TRANSCRIPTION_CACHE = BASE_CACHE / "transcription"
def _transcription_cache_path(reference: str) -> Path:
    """Return the on-disk JSON cache file that holds the transcript for *reference*."""
    filename = f"{reference}.json"
    return TRANSCRIPTION_CACHE / filename
def render_status_box(message: str, tone: str = "placeholder") -> str:
    """Wrap *message* in a health-box ``<div>`` styled for the given tone.

    Known tones are ``"success"``, ``"fail"`` and ``"placeholder"``; any
    unrecognized tone falls back to the placeholder styling.
    """
    known_tones = ("success", "fail", "placeholder")
    css_class = f"health-{tone}" if tone in known_tones else "health-placeholder"
    return f"<div class='health-box {css_class}'>{message}</div>"
def _extract_video_id(video_url: str) -> str | None:
parsed = urlparse(video_url.strip())
if parsed.netloc.endswith("youtu.be"):
return parsed.path.lstrip("/") or None
if parsed.netloc.endswith("youtube.com"):
query = parse_qs(parsed.query)
if "v" in query and query["v"]:
return query["v"][0]
return None
def _fetch_transcript(video_url: str) -> tuple[str | None, str | None]:
    """Retrieve or cache a plain-text transcript for the given YouTube URL.

    Returns a ``(transcript, error)`` pair: exactly one element is a string
    and the other is ``None``.

    For the purposes of this cell we rely on YouTube auto captions via
    yt-dlp; the heavy-duty Gemini-based transcription lives in the MCP
    tools and separate demo cells.
    """
    TRANSCRIPTION_CACHE.mkdir(parents=True, exist_ok=True)
    if YoutubeDL is None:  # pragma: no cover - dependency should always be present
        return None, "yt-dlp is not installed in this environment."
    video_id = _extract_video_id(video_url)
    if not video_id:
        return None, "That does not look like a valid YouTube URL with a video id."
    # Align cache layout with `media_tools`: transcription cache under
    # BASE_CACHE/transcription using a stable reference derived from the
    # YouTube video id when available. This keeps the demo and MCP server
    # caches compatible and easier to inspect.
    reference = f"youtube_{hashlib.sha256(video_id.encode('utf-8')).hexdigest()[:32]}"
    cache_path = _transcription_cache_path(reference)
    if cache_path.exists():
        try:
            # The cache stores the transcript as a JSON-encoded string.
            cached = json.loads(cache_path.read_text(encoding="utf-8"))
        except Exception:
            cached = None  # corrupt cache entry: fall through and re-download
        if isinstance(cached, str) and cached.strip():
            return cached, None
    with tempfile.TemporaryDirectory() as tmpdir:
        output_template = str(Path(tmpdir) / "%(id)s.%(ext)s")
        ydl_opts = {
            "skip_download": True,          # captions only, no media download
            "writeautomaticsub": True,      # YouTube auto-generated captions
            "writesubtitles": False,        # skip manually uploaded subtitles
            "subtitleslangs": SEARCH_LANGUAGES,
            "subtitlesformat": "vtt",
            "quiet": True,
            "no_warnings": True,
            "outtmpl": output_template,
            # BUGFIX: yt-dlp's option for suppressing playlist expansion is
            # `noplaylist`; the previous `allow_playlist` key was not a
            # recognized option and had no effect.
            "noplaylist": True,
        }
        try:
            with YoutubeDL(ydl_opts) as ydl:
                ydl.download([video_url])
        except Exception as exc:  # noqa: BLE001 - expose yt-dlp failures to the UI
            return None, f"Could not download auto captions via yt-dlp: {exc}"
        caption_files = sorted(Path(tmpdir).glob("*.vtt"))
        if not caption_files:
            # Message reflects SEARCH_LANGUAGES (currently German only);
            # the previous text wrongly claimed English was also tried.
            return None, (
                "No automatic captions were available for this video in the "
                "requested language(s). Try providing a different language "
                "variant or another clip."
            )
        text_chunks = []
        for file in caption_files:
            payload = file.read_text(encoding="utf-8", errors="replace")
            cleaned = _vtt_to_text(payload)
            if cleaned:
                text_chunks.append(cleaned)
        readable = " ".join(text_chunks).strip()
        if not readable:
            return None, "Transcript was empty. Try again or choose another video."
        try:
            cache_path.write_text(json.dumps(readable), encoding="utf-8")
        except Exception:
            # Cache failures should not block the happy path.
            pass
        return readable, None
def _vtt_to_text(vtt_payload: str) -> str:
"""Strip timestamps/cue indices from VTT so we can search plain text."""
cleaned_lines = []
for raw_line in vtt_payload.splitlines():
line = raw_line.strip()
if not line or line.upper().startswith("WEBVTT"):
continue
if "-->" in line: # timestamp cue
continue
if line.isdigit(): # cue index
continue
cleaned_lines.append(line)
return " ".join(cleaned_lines)
def analyze_transcript(video_url: str | None = None) -> tuple[str, str]:
    """Check the video's captions for SEARCH_TERM and render the verdict.

    Returns ``(status_box_html, details_markdown)``; on transcript fetch
    failure the details string is empty.
    """
    transcript_text, error = _fetch_transcript(video_url or DEFAULT_VIDEO_URL)
    if error:
        return render_status_box(error, "fail"), ""
    term_present = SEARCH_TERM.lower() in transcript_text.lower()
    if term_present:
        tone = "fail"
        headline = (
            f"🚨 We spotted “{SEARCH_TERM}” in this transcript — a hallucinated emergency-state framing."
        )
        result_line = "Result: the ASR output hallucinated an emergency-state treaty reference."
    else:
        tone = "success"
        headline = (
            f"✅ “{SEARCH_TERM}” does **not** show up in the transcript. "
            f"The speaker consistently references {CORRECT_TERM}."
        )
        result_line = "Result: the captions stay with NOOTS – no emergency-state treaty was mentioned."
    detail_lines = [
        f"**Search term**: “{SEARCH_TERM}”.",
        f"**{result_line}**",
        "",
        f"- **{SEARCH_TERM}** → “emergency state treaty” – suggests constitutional crisis powers.",
        f"- **{CORRECT_TERM}** → “National Once-Only Technical System treaty” – "
        "a data-sharing infrastructure for German public administrations.",
        "",
        "Mishearing “NOOTS” as “Not” is an *ASR hallucination*. When an LLM then riffs on "
        "that wrong token, it creates a second-layer hallucination that falsely claims an emergency "
        "law was debated. In reality, the Smart Country convention session discussed register modernisation and once-only data exchange.",
    ]
    return render_status_box(headline, tone), "\n".join(detail_lines)
def render_problem_cell() -> None:
    """Render the "ASR hallucinations" demo cell: background text, photo,
    a fixed YouTube URL box, and a button that runs `analyze_transcript`."""
    with cell("ℹ️ Problem: ASR hallucinations"):
        # NOTE: fixed factual error in the copy — NIS2 is the EU cybersecurity
        # (Network and Information Security) directive, not an anti-money
        # laundering directive.
        gr.Markdown(
            f"""### 👩🏻‍🏫 Background
Automatically generated transcripts and subtitles provided by video or podcast distribution sites may appear as a straightforward
source to ground summaries or chat-with-your-video use cases in. With YouTube in particular, however, there is a systemic hallucination risk:
the cybersecurity directive "NIS2" may become "these two", the IT concept of "interoperability" may become the unrelated quality of
"endurability"... and the data sharing treaty for public administration 🇩🇪 "NOOTS-Staatsvertrag" may become emergency state powers
🇩🇪 "Notstaatsvertrag". Particularly with non-English languages or non-native speakers of the English language, the hallucination risk
from Automatic Speech Recognition (ASR) and the hallucination risk from chatbot Large Language Models compound - rendering e.g. ChatGPT Atlas
a brittle tool for such tasks.
""",
        )
        gr.Image(
            value=DIGITALGIPFEL_IMG,
            show_label=True,
            interactive=False,
            elem_id="digitalgipfel-photo",
            label='ASR trip: "asset" turns into "acid"',
        )
        gr.Markdown("""### 💁🏻‍♀️ Demo
We're going to download the YouTube subtitles of a panel discussion
recorded at the Smart Country Convention 2025 - and check if the ASR hallucinated emergency state powers (❌) or got
the German language term "NOOTS-Staatsvertrag" right (✅). The goal is to make it visible how ASR recognition could
cause faulty LLM interpretation built on top of them.
""")
        url_box = gr.Textbox(
            label="YouTube video URL",
            value=DEFAULT_VIDEO_URL,
            interactive=False,  # demo is pinned to the known clip
        )
        check_button = gr.Button("Check transcript for “Notstaatsvertrag”", variant="primary")
        result_panel = gr.HTML(
            value=render_status_box(
                "👉 Click “Check transcript…” to fetch the captions and verify what was actually said.",
                "placeholder",
            )
        )
        result_details = gr.Markdown(visible=True)
        check_button.click(
            fn=analyze_transcript,
            inputs=url_box,
            outputs=[result_panel, result_details],
            queue=False,  # run directly; the fetch is short and cached
        )
|