aileen3-core / demo /problem_cell.py
ndurner's picture
add comments
0c163b8
from __future__ import annotations

import hashlib
import html
import json
import os
import re
import tempfile
from pathlib import Path
from urllib.parse import parse_qs, urlparse

import gradio as gr
try:
from yt_dlp import YoutubeDL
except ImportError: # pragma: no cover - yt-dlp is in requirements, but guard for clarity
YoutubeDL = None # type: ignore[assignment]
from layout import cell
# Demo defaults: the clip analysed when no URL is supplied, the term that is
# searched for in the captions, and the term presented as its correct form.
DEFAULT_VIDEO_URL = "https://www.youtube.com/watch?v=Dvjg8R0jUAk"
SEARCH_TERM = "Notstaatsvertrag"
CORRECT_TERM = "NOOTS-Staatsvertrag"
# Caption languages requested from yt-dlp (passed as "subtitleslangs").
SEARCH_LANGUAGES = ["de"]

# Static assets live in the "assets" directory next to this module.
HERE = Path(__file__).parent
ASSETS_DIR = HERE / "assets"
DIGITALGIPFEL_IMG = ASSETS_DIR / "digitalgipfel.jpeg"

# Cache root: taken from the AILEEN3_CACHE_DIR environment variable when set,
# otherwise ~/.cache/aileen3. Transcripts go into its "transcription" subfolder.
BASE_CACHE = Path(os.environ.get("AILEEN3_CACHE_DIR", Path.home() / ".cache" / "aileen3"))
TRANSCRIPTION_CACHE = BASE_CACHE / "transcription"
def _transcription_cache_path(reference: str) -> Path:
    """Return the JSON cache file location for the transcript *reference*."""
    filename = f"{reference}.json"
    return TRANSCRIPTION_CACHE.joinpath(filename)
def render_status_box(message: str, tone: str = "placeholder") -> str:
    """Wrap *message* in the ``health-box`` div used for status output.

    ``tone`` selects the CSS modifier class. Any value other than
    ``"success"``, ``"fail"`` or ``"placeholder"`` falls back to the
    placeholder styling. The message is inserted verbatim, without HTML
    escaping.
    """
    known_tones = {"success", "fail", "placeholder"}
    modifier = tone if tone in known_tones else "placeholder"
    return f"<div class='health-box health-{modifier}'>{message}</div>"
def _extract_video_id(video_url: str) -> str | None:
parsed = urlparse(video_url.strip())
if parsed.netloc.endswith("youtu.be"):
return parsed.path.lstrip("/") or None
if parsed.netloc.endswith("youtube.com"):
query = parse_qs(parsed.query)
if "v" in query and query["v"]:
return query["v"][0]
return None
def _fetch_transcript(video_url: str) -> tuple[str | None, str | None]:
    """Retrieve or cache a plain-text transcript for the given YouTube URL.

    For the purposes of this cell we rely on YouTube auto captions via
    yt-dlp; the heavy-duty Gemini-based transcription lives in the MCP
    tools and separate demo cells.

    Args:
        video_url: URL of the YouTube video whose auto captions are wanted.

    Returns:
        ``(transcript, None)`` on success, or ``(None, error_message)`` when
        yt-dlp is unavailable, the URL carries no video id, the download
        fails, or no usable caption text was produced.
    """
    if YoutubeDL is None:  # pragma: no cover - dependency should always be present
        return None, "yt-dlp is not installed in this environment."

    video_id = _extract_video_id(video_url)
    if not video_id:
        return None, "That does not look like a valid YouTube URL with a video id."

    # Align cache layout with `media_tools`: transcription cache under
    # BASE_CACHE/transcription using a stable reference derived from the
    # YouTube video id when available. This keeps the demo and MCP server
    # caches compatible and easier to inspect.
    reference = f"youtube_{hashlib.sha256(video_id.encode('utf-8')).hexdigest()[:32]}"
    cache_path = _transcription_cache_path(reference)

    # A missing, unreadable or corrupt cache entry is treated as a cache miss.
    try:
        cached = json.loads(cache_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        cached = None
    if isinstance(cached, str) and cached.strip():
        return cached, None

    with tempfile.TemporaryDirectory() as tmpdir:
        output_template = str(Path(tmpdir) / "%(id)s.%(ext)s")
        ydl_opts = {
            "skip_download": True,
            "writeautomaticsub": True,
            "writesubtitles": False,
            "subtitleslangs": SEARCH_LANGUAGES,
            "subtitlesformat": "vtt",
            "quiet": True,
            "no_warnings": True,
            "outtmpl": output_template,
            # `noplaylist` is the yt-dlp option for "single video only";
            # the previous `allow_playlist` key is not a documented option.
            "noplaylist": True,
        }
        try:
            with YoutubeDL(ydl_opts) as ydl:
                ydl.download([video_url])
        except Exception as exc:  # noqa: BLE001 - expose yt-dlp failures to the UI
            return None, f"Could not download auto captions via yt-dlp: {exc}"

        caption_files = sorted(Path(tmpdir).glob("*.vtt"))
        if not caption_files:
            languages = ", ".join(SEARCH_LANGUAGES)
            return None, (
                "No automatic captions were available for this video in the "
                f"requested language(s): {languages}. "
                "Try providing a different language variant or another clip."
            )

        # Read the captions while the temporary directory still exists.
        text_chunks = []
        for caption_file in caption_files:
            payload = caption_file.read_text(encoding="utf-8", errors="replace")
            cleaned = _vtt_to_text(payload)
            if cleaned:
                text_chunks.append(cleaned)

    readable = " ".join(text_chunks).strip()
    if not readable:
        return None, "Transcript was empty. Try again or choose another video."

    try:
        # Creating the directory is part of the best-effort cache write, so
        # an unwritable cache location cannot block fetching the captions.
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        cache_path.write_text(json.dumps(readable), encoding="utf-8")
    except OSError:
        # Cache failures should not block the happy path.
        pass
    return readable, None
def _vtt_to_text(vtt_payload: str) -> str:
"""Strip timestamps/cue indices from VTT so we can search plain text."""
cleaned_lines = []
for raw_line in vtt_payload.splitlines():
line = raw_line.strip()
if not line or line.upper().startswith("WEBVTT"):
continue
if "-->" in line: # timestamp cue
continue
if line.isdigit(): # cue index
continue
cleaned_lines.append(line)
return " ".join(cleaned_lines)
def analyze_transcript(video_url: str | None = None) -> tuple[str, str]:
    """Fetch the captions and report whether ``SEARCH_TERM`` occurs in them.

    Args:
        video_url: YouTube URL to analyse; ``DEFAULT_VIDEO_URL`` is used when
            it is ``None`` or empty.

    Returns:
        A ``(status_html, details_markdown)`` pair: the status box produced
        by ``render_status_box`` and a Markdown explanation. On failure the
        status box carries the error and the details are ``""``.
    """
    transcript_text, error = _fetch_transcript(video_url or DEFAULT_VIDEO_URL)
    if error or transcript_text is None:
        # The error may embed raw yt-dlp output; escape it because the status
        # box is rendered as HTML.
        message = html.escape(error or "No transcript could be retrieved.", quote=False)
        return render_status_box(message, "fail"), ""

    found_term = SEARCH_TERM.lower() in transcript_text.lower()
    if found_term:
        headline = (
            f"🚨 We spotted “{SEARCH_TERM}” in this transcript — a hallucinated emergency-state framing."
        )
        tone = "fail"
        result_line = "Result: the ASR output hallucinated an emergency-state treaty reference."
    else:
        # The headline is shown in an HTML component, so emphasis has to be
        # HTML rather than Markdown (``**not**`` would show literal asterisks).
        headline = (
            f"✅ “{SEARCH_TERM}” does <strong>not</strong> show up in the transcript. "
            f"The speaker consistently references {CORRECT_TERM}."
        )
        tone = "success"
        result_line = "Result: the captions stay with NOOTS – no emergency-state treaty was mentioned."

    body = [
        f"**Search term**: “{SEARCH_TERM}”.",
        f"**{result_line}**",
        "",
        f"- **{SEARCH_TERM}** → “emergency state treaty” – suggests constitutional crisis powers.",
        f"- **{CORRECT_TERM}** → “National Once-Only Technical System treaty” – "
        "a data-sharing infrastructure for German public administrations.",
        "",
        "Mishearing “NOOTS” as “Not” is an *ASR hallucination*. When an LLM then riffs on "
        "that wrong token, it creates a second-layer hallucination that falsely claims an emergency "
        "law was debated. In reality, the Smart Country convention session discussed register modernisation and once-only data exchange.",
    ]
    return render_status_box(headline, tone), "\n".join(body)
def render_problem_cell() -> None:
    """Build the "Problem: ASR hallucinations" demo cell.

    Inside the ``cell(...)`` layout context this adds the background text,
    an illustrative image, a read-only URL box pre-filled with
    ``DEFAULT_VIDEO_URL``, and a button that runs ``analyze_transcript`` and
    writes its two results into the status panel and the details Markdown.
    Presumably it has to be called while a Gradio Blocks context is active -
    confirm against the caller.
    """
    with cell("ℹ️ Problem: ASR hallucinations"):
        gr.Markdown(
            """### 👩🏻‍🏫 Background
Automatically generated transcripts and subtitles provided by video or podcast distribution sites may appear as a straightforward
source to ground summaries or chat-with-your-video use cases in. With YouTube in particular, however, there is a systemic hallucination risk:
the EU cybersecurity directive "NIS2" may become "these two", the IT concept of "interoperability" may become the unrelated quality of
"endurability"... and the data sharing treaty for public administration 🇩🇪 "NOOTS-Staatsvertrag" may become emergency state powers
🇩🇪 "Notstaatsvertrag". Particularly with non-English languages or non-native speakers of the English language, the hallucination risk
from Automatic Speech Recognition (ASR) and the hallucination risk from chatbot Large Language Models compound - rendering e.g. ChatGPT Atlas
a brittle tool for such tasks.
""",
        )
        gr.Image(
            value=DIGITALGIPFEL_IMG,
            show_label=True,
            interactive=False,
            elem_id="digitalgipfel-photo",
            label='ASR trip: "asset" turns into "acid"'
        )
        gr.Markdown("""### 💁🏻‍♀️ Demo
We're going to download the YouTube subtitles of a panel discussion
recorded at the Smart Country Convention 2025 - and check if the ASR hallucinated emergency state powers (❌) or got
the German language term "NOOTS-Staatsvertrag" right (✅). The goal is to make it visible how ASR recognition could
cause faulty LLM interpretation built on top of them.
""")
        # The URL is fixed for the demo; the box only displays it.
        url_box = gr.Textbox(
            label="YouTube video URL",
            value=DEFAULT_VIDEO_URL,
            interactive=False,
        )
        check_button = gr.Button("Check transcript for “Notstaatsvertrag”", variant="primary")
        result_panel = gr.HTML(
            value=render_status_box(
                "👉 Click “Check transcript…” to fetch the captions and verify what was actually said.",
                "placeholder",
            )
        )
        result_details = gr.Markdown(visible=True)
        # analyze_transcript returns (status_html, details_markdown), matching
        # the order of the two output components.
        check_button.click(
            fn=analyze_transcript,
            inputs=url_box,
            outputs=[result_panel, result_details],
            queue=False,
        )