Spaces:
Running
Running
File size: 9,320 Bytes
9188dd0 982628c 9188dd0 982628c 9188dd0 0c163b8 982628c 9188dd0 982628c 0c163b8 982628c 9188dd0 982628c 9188dd0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 |
from __future__ import annotations

import hashlib
import json
import os
import re
import tempfile
from pathlib import Path
from urllib.parse import parse_qs, urlparse

import gradio as gr

try:
    from yt_dlp import YoutubeDL
except ImportError:  # pragma: no cover - yt-dlp is in requirements, but guard for clarity
    YoutubeDL = None  # type: ignore[assignment]

from layout import cell
DEFAULT_VIDEO_URL = "https://www.youtube.com/watch?v=Dvjg8R0jUAk"
SEARCH_TERM = "Notstaatsvertrag"
CORRECT_TERM = "NOOTS-Staatsvertrag"
SEARCH_LANGUAGES = ["de"]
HERE = Path(__file__).parent
ASSETS_DIR = HERE / "assets"
DIGITALGIPFEL_IMG = ASSETS_DIR / "digitalgipfel.jpeg"
BASE_CACHE = Path(os.environ.get("AILEEN3_CACHE_DIR", Path.home() / ".cache" / "aileen3"))
TRANSCRIPTION_CACHE = BASE_CACHE / "transcription"
def _transcription_cache_path(reference: str) -> Path:
    """Return the on-disk JSON cache file that holds the transcript for *reference*."""
    filename = f"{reference}.json"
    return TRANSCRIPTION_CACHE / filename
def render_status_box(message: str, tone: str = "placeholder") -> str:
    """Wrap *message* in a health-box ``<div>`` styled for the given tone.

    Known tones are ``"success"``, ``"fail"`` and ``"placeholder"``; any
    unrecognized tone falls back to the placeholder styling.
    """
    known_tones = ("success", "fail", "placeholder")
    css_class = f"health-{tone}" if tone in known_tones else "health-placeholder"
    return f"<div class='health-box {css_class}'>{message}</div>"
def _extract_video_id(video_url: str) -> str | None:
parsed = urlparse(video_url.strip())
if parsed.netloc.endswith("youtu.be"):
return parsed.path.lstrip("/") or None
if parsed.netloc.endswith("youtube.com"):
query = parse_qs(parsed.query)
if "v" in query and query["v"]:
return query["v"][0]
return None
def _fetch_transcript(video_url: str) -> tuple[str | None, str | None]:
    """Retrieve or cache a plain-text transcript for the given YouTube URL.

    Returns a ``(transcript, error)`` pair: exactly one element is a string
    and the other is ``None``.

    For the purposes of this cell we rely on YouTube auto captions via
    yt-dlp; the heavy-duty Gemini-based transcription lives in the MCP
    tools and separate demo cells.
    """
    TRANSCRIPTION_CACHE.mkdir(parents=True, exist_ok=True)
    if YoutubeDL is None:  # pragma: no cover - dependency should always be present
        return None, "yt-dlp is not installed in this environment."
    video_id = _extract_video_id(video_url)
    if not video_id:
        return None, "That does not look like a valid YouTube URL with a video id."
    # Align cache layout with `media_tools`: transcription cache under
    # BASE_CACHE/transcription using a stable reference derived from the
    # YouTube video id when available. This keeps the demo and MCP server
    # caches compatible and easier to inspect.
    reference = f"youtube_{hashlib.sha256(video_id.encode('utf-8')).hexdigest()[:32]}"
    cache_path = _transcription_cache_path(reference)
    if cache_path.exists():
        try:
            # The cache stores the transcript as a JSON-encoded string.
            cached = json.loads(cache_path.read_text(encoding="utf-8"))
        except Exception:
            cached = None  # corrupt cache entry: fall through and re-download
        if isinstance(cached, str) and cached.strip():
            return cached, None
    with tempfile.TemporaryDirectory() as tmpdir:
        output_template = str(Path(tmpdir) / "%(id)s.%(ext)s")
        ydl_opts = {
            "skip_download": True,          # captions only, no media download
            "writeautomaticsub": True,      # YouTube auto-generated captions
            "writesubtitles": False,        # skip manually uploaded subtitles
            "subtitleslangs": SEARCH_LANGUAGES,
            "subtitlesformat": "vtt",
            "quiet": True,
            "no_warnings": True,
            "outtmpl": output_template,
            # BUGFIX: yt-dlp's option for suppressing playlist expansion is
            # `noplaylist`; the previous `allow_playlist` key was not a
            # recognized option and had no effect.
            "noplaylist": True,
        }
        try:
            with YoutubeDL(ydl_opts) as ydl:
                ydl.download([video_url])
        except Exception as exc:  # noqa: BLE001 - expose yt-dlp failures to the UI
            return None, f"Could not download auto captions via yt-dlp: {exc}"
        caption_files = sorted(Path(tmpdir).glob("*.vtt"))
        if not caption_files:
            # Message reflects SEARCH_LANGUAGES (currently German only);
            # the previous text wrongly claimed English was also tried.
            return None, (
                "No automatic captions were available for this video in the "
                "requested language(s). Try providing a different language "
                "variant or another clip."
            )
        text_chunks = []
        for file in caption_files:
            payload = file.read_text(encoding="utf-8", errors="replace")
            cleaned = _vtt_to_text(payload)
            if cleaned:
                text_chunks.append(cleaned)
        readable = " ".join(text_chunks).strip()
        if not readable:
            return None, "Transcript was empty. Try again or choose another video."
        try:
            cache_path.write_text(json.dumps(readable), encoding="utf-8")
        except Exception:
            # Cache failures should not block the happy path.
            pass
        return readable, None
def _vtt_to_text(vtt_payload: str) -> str:
"""Strip timestamps/cue indices from VTT so we can search plain text."""
cleaned_lines = []
for raw_line in vtt_payload.splitlines():
line = raw_line.strip()
if not line or line.upper().startswith("WEBVTT"):
continue
if "-->" in line: # timestamp cue
continue
if line.isdigit(): # cue index
continue
cleaned_lines.append(line)
return " ".join(cleaned_lines)
def analyze_transcript(video_url: str | None = None) -> tuple[str, str]:
    """Check the video's captions for SEARCH_TERM and render the verdict.

    Returns ``(status_box_html, details_markdown)``; on transcript fetch
    failure the details string is empty.
    """
    transcript_text, error = _fetch_transcript(video_url or DEFAULT_VIDEO_URL)
    if error:
        return render_status_box(error, "fail"), ""
    term_present = SEARCH_TERM.lower() in transcript_text.lower()
    if term_present:
        tone = "fail"
        headline = (
            f"🚨 We spotted “{SEARCH_TERM}” in this transcript — a hallucinated emergency-state framing."
        )
        result_line = "Result: the ASR output hallucinated an emergency-state treaty reference."
    else:
        tone = "success"
        headline = (
            f"✅ “{SEARCH_TERM}” does **not** show up in the transcript. "
            f"The speaker consistently references {CORRECT_TERM}."
        )
        result_line = "Result: the captions stay with NOOTS – no emergency-state treaty was mentioned."
    detail_lines = [
        f"**Search term**: “{SEARCH_TERM}”.",
        f"**{result_line}**",
        "",
        f"- **{SEARCH_TERM}** → “emergency state treaty” – suggests constitutional crisis powers.",
        f"- **{CORRECT_TERM}** → “National Once-Only Technical System treaty” – "
        "a data-sharing infrastructure for German public administrations.",
        "",
        "Mishearing “NOOTS” as “Not” is an *ASR hallucination*. When an LLM then riffs on "
        "that wrong token, it creates a second-layer hallucination that falsely claims an emergency "
        "law was debated. In reality, the Smart Country convention session discussed register modernisation and once-only data exchange.",
    ]
    return render_status_box(headline, tone), "\n".join(detail_lines)
def render_problem_cell() -> None:
    """Render the "ASR hallucinations" demo cell: background text, photo,
    a fixed YouTube URL box, and a button that runs `analyze_transcript`."""
    with cell("ℹ️ Problem: ASR hallucinations"):
        # NOTE: fixed factual error in the copy — NIS2 is the EU cybersecurity
        # (Network and Information Security) directive, not an anti-money
        # laundering directive.
        gr.Markdown(
            f"""### 👩🏻‍🏫 Background
Automatically generated transcripts and subtitles provided by video or podcast distribution sites may appear as a straightforward
source to ground summaries or chat-with-your-video use cases in. With YouTube in particular, however, there is a systemic hallucination risk:
the cybersecurity directive "NIS2" may become "these two", the IT concept of "interoperability" may become the unrelated quality of
"endurability"... and the data sharing treaty for public administration 🇩🇪 "NOOTS-Staatsvertrag" may become emergency state powers
🇩🇪 "Notstaatsvertrag". Particularly with non-English languages or non-native speakers of the English language, the hallucination risk
from Automatic Speech Recognition (ASR) and the hallucination risk from chatbot Large Language Models compound - rendering e.g. ChatGPT Atlas
a brittle tool for such tasks.
""",
        )
        gr.Image(
            value=DIGITALGIPFEL_IMG,
            show_label=True,
            interactive=False,
            elem_id="digitalgipfel-photo",
            label='ASR trip: "asset" turns into "acid"',
        )
        gr.Markdown("""### 💁🏻‍♀️ Demo
We're going to download the YouTube subtitles of a panel discussion
recorded at the Smart Country Convention 2025 - and check if the ASR hallucinated emergency state powers (❌) or got
the German language term "NOOTS-Staatsvertrag" right (✅). The goal is to make it visible how ASR recognition could
cause faulty LLM interpretation built on top of them.
""")
        url_box = gr.Textbox(
            label="YouTube video URL",
            value=DEFAULT_VIDEO_URL,
            interactive=False,  # demo is pinned to the known clip
        )
        check_button = gr.Button("Check transcript for “Notstaatsvertrag”", variant="primary")
        result_panel = gr.HTML(
            value=render_status_box(
                "👉 Click “Check transcript…” to fetch the captions and verify what was actually said.",
                "placeholder",
            )
        )
        result_details = gr.Markdown(visible=True)
        check_button.click(
            fn=analyze_transcript,
            inputs=url_box,
            outputs=[result_panel, result_details],
            queue=False,  # run directly; the fetch is short and cached
        )
|