Spaces:
Running
Running
| from __future__ import annotations | |
| import hashlib | |
| import json | |
| import os | |
| import tempfile | |
| from pathlib import Path | |
| from urllib.parse import parse_qs, urlparse | |
| import gradio as gr | |
| try: | |
| from yt_dlp import YoutubeDL | |
| except ImportError: # pragma: no cover - yt-dlp is in requirements, but guard for clarity | |
| YoutubeDL = None # type: ignore[assignment] | |
| from layout import cell | |
# Video analysed when the caller does not supply a URL of its own.
DEFAULT_VIDEO_URL = "https://www.youtube.com/watch?v=Dvjg8R0jUAk"
# Term looked up (case-insensitively) in the transcript, and the term it is
# contrasted with in the explanation shown to the user.
SEARCH_TERM = "Notstaatsvertrag"
CORRECT_TERM = "NOOTS-Staatsvertrag"
# Caption languages requested from yt-dlp.
SEARCH_LANGUAGES = ["de"]
# Static assets live in an "assets" directory next to this module.
HERE = Path(__file__).parent
ASSETS_DIR = HERE / "assets"
DIGITALGIPFEL_IMG = ASSETS_DIR / "digitalgipfel.jpeg"
# Cache root: $AILEEN3_CACHE_DIR when set, otherwise ~/.cache/aileen3.
BASE_CACHE = Path(os.environ.get("AILEEN3_CACHE_DIR", Path.home() / ".cache" / "aileen3"))
# Transcripts are cached as JSON files below this directory.
TRANSCRIPTION_CACHE = BASE_CACHE / "transcription"
def _transcription_cache_path(reference: str) -> Path:
    """Return the JSON cache file for *reference* inside ``TRANSCRIPTION_CACHE``."""
    filename = f"{reference}.json"
    return TRANSCRIPTION_CACHE.joinpath(filename)
def render_status_box(message: str, tone: str = "placeholder") -> str:
    """Wrap *message* in a ``health-box`` ``<div>`` styled for *tone*.

    Recognised tones are ``"success"``, ``"fail"`` and ``"placeholder"``; any
    other value is rendered with the placeholder styling. The message is
    inserted verbatim, so callers must escape untrusted text themselves.
    """
    if tone == "success":
        css_class = "health-success"
    elif tone == "fail":
        css_class = "health-fail"
    else:
        # Covers "placeholder" as well as every unknown tone.
        css_class = "health-placeholder"
    return f"<div class='health-box {css_class}'>{message}</div>"
| def _extract_video_id(video_url: str) -> str | None: | |
| parsed = urlparse(video_url.strip()) | |
| if parsed.netloc.endswith("youtu.be"): | |
| return parsed.path.lstrip("/") or None | |
| if parsed.netloc.endswith("youtube.com"): | |
| query = parse_qs(parsed.query) | |
| if "v" in query and query["v"]: | |
| return query["v"][0] | |
| return None | |
def _fetch_transcript(video_url: str) -> tuple[str | None, str | None]:
    """Retrieve or cache a plain-text transcript for the given YouTube URL.

    For the purposes of this cell we rely on YouTube auto captions via
    yt-dlp; the heavy-duty Gemini-based transcription lives in the MCP
    tools and separate demo cells.

    Args:
        video_url: URL of the YouTube video whose automatic captions are wanted.

    Returns:
        ``(transcript, None)`` on success, ``(None, error_message)`` otherwise.
        The error message is plain text meant to be shown to the user.
    """
    if YoutubeDL is None:  # pragma: no cover - dependency should always be present
        return None, "yt-dlp is not installed in this environment."

    video_id = _extract_video_id(video_url)
    if not video_id:
        return None, "That does not look like a valid YouTube URL with a video id."

    # Align cache layout with `media_tools`: transcription cache under
    # BASE_CACHE/transcription using a stable reference derived from the
    # YouTube video id when available. This keeps the demo and MCP server
    # caches compatible and easier to inspect.
    reference = f"youtube_{hashlib.sha256(video_id.encode('utf-8')).hexdigest()[:32]}"
    cache_path = _transcription_cache_path(reference)

    # A missing, unreadable or corrupt cache entry simply means "not cached".
    try:
        cached = json.loads(cache_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        cached = None
    if isinstance(cached, str) and cached.strip():
        return cached, None

    with tempfile.TemporaryDirectory() as tmpdir:
        ydl_opts = {
            "skip_download": True,
            "writeautomaticsub": True,
            "writesubtitles": False,
            "subtitleslangs": SEARCH_LANGUAGES,
            "subtitlesformat": "vtt",
            "quiet": True,
            "no_warnings": True,
            "outtmpl": str(Path(tmpdir) / "%(id)s.%(ext)s"),
            # yt-dlp's option for "only this video, never the surrounding playlist".
            "noplaylist": True,
        }
        try:
            with YoutubeDL(ydl_opts) as ydl:
                ydl.download([video_url])
        except Exception as exc:  # noqa: BLE001 - expose yt-dlp failures to the UI
            return None, f"Could not download auto captions via yt-dlp: {exc}"

        caption_files = sorted(Path(tmpdir).glob("*.vtt"))
        if not caption_files:
            languages = ", ".join(SEARCH_LANGUAGES)
            return None, (
                f"No automatic captions in the requested language(s) ({languages}) "
                "were available for this video. "
                "Try providing a different language variant or another clip."
            )

        # Must happen inside the ``with`` block: the files vanish with tmpdir.
        text_chunks = []
        for caption_file in caption_files:
            cleaned = _vtt_to_text(caption_file.read_text(encoding="utf-8", errors="replace"))
            if cleaned:
                text_chunks.append(cleaned)

    readable = " ".join(text_chunks).strip()
    if not readable:
        return None, "Transcript was empty. Try again or choose another video."

    try:
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        cache_path.write_text(json.dumps(readable), encoding="utf-8")
    except OSError:
        # Cache failures should not block the happy path.
        pass
    return readable, None
| def _vtt_to_text(vtt_payload: str) -> str: | |
| """Strip timestamps/cue indices from VTT so we can search plain text.""" | |
| cleaned_lines = [] | |
| for raw_line in vtt_payload.splitlines(): | |
| line = raw_line.strip() | |
| if not line or line.upper().startswith("WEBVTT"): | |
| continue | |
| if "-->" in line: # timestamp cue | |
| continue | |
| if line.isdigit(): # cue index | |
| continue | |
| cleaned_lines.append(line) | |
| return " ".join(cleaned_lines) | |
def analyze_transcript(video_url: str | None = None) -> tuple[str, str]:
    """Fetch the captions for *video_url* and report whether ``SEARCH_TERM`` occurs.

    Args:
        video_url: YouTube URL to inspect; ``None``, empty or whitespace-only
            input falls back to ``DEFAULT_VIDEO_URL``.

    Returns:
        A ``(status_html, details_markdown)`` pair. If the transcript cannot
        be obtained, the status box carries the error message and the details
        are an empty string.
    """
    import html

    requested_url = (video_url or "").strip() or DEFAULT_VIDEO_URL
    transcript_text, error = _fetch_transcript(requested_url)
    if error or not transcript_text:
        message = error or "Transcript was empty. Try again or choose another video."
        # The message may embed raw yt-dlp exception text and is rendered as
        # HTML, so it has to be escaped.
        return render_status_box(html.escape(message), "fail"), ""

    # Case-insensitive substring search over the whole transcript.
    found_term = SEARCH_TERM.lower() in transcript_text.lower()
    if found_term:
        headline = (
            f"🚨 We spotted “{SEARCH_TERM}” in this transcript — a hallucinated emergency-state framing."
        )
        tone = "fail"
        result_line = "Result: the ASR output hallucinated an emergency-state treaty reference."
    else:
        headline = (
            f"✅ “{SEARCH_TERM}” does **not** show up in the transcript. "
            f"The speaker consistently references {CORRECT_TERM}."
        )
        tone = "success"
        result_line = "Result: the captions stay with NOOTS – no emergency-state treaty was mentioned."

    body = [
        f"**Search term**: “{SEARCH_TERM}”.",
        f"**{result_line}**",
        "",
        f"- **{SEARCH_TERM}** → “emergency state treaty” – suggests constitutional crisis powers.",
        f"- **{CORRECT_TERM}** → “National Once-Only Technical System treaty” – "
        "a data-sharing infrastructure for German public administrations.",
        "",
        "Mishearing “NOOTS” as “Not” is an *ASR hallucination*. When an LLM then riffs on "
        "that wrong token, it creates a second-layer hallucination that falsely claims an emergency "
        "law was debated. In reality, the Smart Country convention session discussed register modernisation and once-only data exchange.",
    ]
    return render_status_box(headline, tone), "\n".join(body)
def render_problem_cell() -> None:
    """Render the "Problem: ASR hallucinations" cell.

    The cell contains, in order: a background text, an illustrative image, a
    description of the demo, a read-only textbox holding ``DEFAULT_VIDEO_URL``,
    a button, and a status box plus a Markdown area that the button fills
    with the output of ``analyze_transcript``.
    """
    with cell("ℹ️ Problem: ASR hallucinations"):
        gr.Markdown(
            """### 👩🏻🏫 Background
Automatically generated transcripts and subtitles provided by video or podcast distribution sites may appear as a straightforward
source to ground summaries or chat-with-your-video use cases in. With YouTube in particular, however, there is a systemic hallucination risk:
the EU cybersecurity directive "NIS2" may become "these two", the IT concept of "interoperability" may become the unrelated quality of
"endurability"... and the data sharing treaty for public administration 🇩🇪 "NOOTS-Staatsvertrag" may become emergency state powers
🇩🇪 "Notstaatsvertrag". Particularly with non-English languages or non-native speakers of the English language, the hallucination risk
from Automatic Speech Recognition (ASR) and the hallucination risk from chatbot Large Language Models compound - rendering e.g. ChatGPT Atlas
a brittle tool for such tasks.
""",
        )
        gr.Image(
            value=DIGITALGIPFEL_IMG,
            show_label=True,
            interactive=False,
            elem_id="digitalgipfel-photo",
            label='ASR trip: "asset" turns into "acid"',
        )
        gr.Markdown("""### 💁🏻♀️ Demo
We're going to download the YouTube subtitles of a panel discussion
recorded at the Smart Country Convention 2025 - and check if the ASR hallucinated emergency state powers (❌) or got
the German language term "NOOTS-Staatsvertrag" right (✅). The goal is to make it visible how ASR recognition could
cause faulty LLM interpretation built on top of them.
""")
        url_box = gr.Textbox(
            label="YouTube video URL",
            value=DEFAULT_VIDEO_URL,
            interactive=False,
        )
        # Built from SEARCH_TERM so the label cannot drift from the term that
        # analyze_transcript actually searches for.
        check_button = gr.Button(f"Check transcript for “{SEARCH_TERM}”", variant="primary")
        result_panel = gr.HTML(
            value=render_status_box(
                "👉 Click “Check transcript…” to fetch the captions and verify what was actually said.",
                "placeholder",
            )
        )
        result_details = gr.Markdown(visible=True)
        check_button.click(
            fn=analyze_transcript,
            inputs=url_box,
            outputs=[result_panel, result_details],
            queue=False,
        )