from __future__ import annotations

import hashlib
import json
import os
import tempfile
from pathlib import Path
from urllib.parse import parse_qs, urlparse

import gradio as gr

try:
    from yt_dlp import YoutubeDL
except ImportError:  # pragma: no cover - yt-dlp is in requirements, but guard for clarity
    YoutubeDL = None  # type: ignore[assignment]

from layout import cell

DEFAULT_VIDEO_URL = "https://www.youtube.com/watch?v=Dvjg8R0jUAk"
SEARCH_TERM = "Notstaatsvertrag"
CORRECT_TERM = "NOOTS-Staatsvertrag"
SEARCH_LANGUAGES = ["de"]

HERE = Path(__file__).parent
ASSETS_DIR = HERE / "assets"
DIGITALGIPFEL_IMG = ASSETS_DIR / "digitalgipfel.jpeg"

BASE_CACHE = Path(os.environ.get("AILEEN3_CACHE_DIR", Path.home() / ".cache" / "aileen3"))
TRANSCRIPTION_CACHE = BASE_CACHE / "transcription"


def _transcription_cache_path(reference: str) -> Path:
    return TRANSCRIPTION_CACHE / f"{reference}.json"
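
# With the default cache root, entries land at paths like
#   ~/.cache/aileen3/transcription/youtube_<32-hex sha256 prefix>.json
# (the `youtube_...` reference is derived in _fetch_transcript below).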


def render_status_box(message: str, tone: str = "placeholder") -> str:
    tone_class = {
        "success": "health-success",
        "fail": "health-fail",
        "placeholder": "health-placeholder",
    }.get(tone, "health-placeholder")
    return f"<div class='health-box {tone_class}'>{message}</div>"


def _extract_video_id(video_url: str) -> str | None:
    parsed = urlparse(video_url.strip())
    if parsed.netloc.endswith("youtu.be"):
        return parsed.path.lstrip("/") or None
    if parsed.netloc.endswith("youtube.com"):
        query = parse_qs(parsed.query)
        if "v" in query and query["v"]:
            return query["v"][0]
    return None
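
# Quick examples of the URL forms handled above (hypothetical video ids):
#   _extract_video_id("https://youtu.be/abc123")                      -> "abc123"
#   _extract_video_id("https://www.youtube.com/watch?v=abc123&t=42s") -> "abc123"
#   _extract_video_id("https://example.com/watch?v=abc123")           -> None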


def _fetch_transcript(video_url: str) -> tuple[str | None, str | None]:
    """Retrieve or cache a plain-text transcript for the given YouTube URL.

    For the purposes of this cell we rely on YouTube auto captions via
    yt-dlp; the heavy-duty Gemini-based transcription lives in the MCP
    tools and separate demo cells.
    """
    TRANSCRIPTION_CACHE.mkdir(parents=True, exist_ok=True)

    if YoutubeDL is None:  # pragma: no cover - dependency should always be present
        return None, "yt-dlp is not installed in this environment."
    video_id = _extract_video_id(video_url)
    if not video_id:
        return None, "That does not look like a valid YouTube URL with a video id."

    # Align cache layout with `media_tools`: transcription cache under
    # BASE_CACHE/transcription using a stable reference derived from the
    # YouTube video id when available. This keeps the demo and MCP server
    # caches compatible and easier to inspect.
    reference = f"youtube_{hashlib.sha256(video_id.encode('utf-8')).hexdigest()[:32]}"
    cache_path = _transcription_cache_path(reference)
    if cache_path.exists():
        try:
            cached = json.loads(cache_path.read_text(encoding="utf-8"))
        except Exception:
            cached = None
        if isinstance(cached, str) and cached.strip():
            return cached, None

    with tempfile.TemporaryDirectory() as tmpdir:
        output_template = str(Path(tmpdir) / "%(id)s.%(ext)s")
        ydl_opts = {
            "skip_download": True,
            "writeautomaticsub": True,
            "writesubtitles": False,
            "subtitleslangs": SEARCH_LANGUAGES,
            "subtitlesformat": "vtt",
            "quiet": True,
            "no_warnings": True,
            "outtmpl": output_template,
            "allow_playlist": False,
        }
        try:
            with YoutubeDL(ydl_opts) as ydl:
                ydl.download([video_url])
        except Exception as exc:  # noqa: BLE001 - expose yt-dlp failures to the UI
            return None, f"Could not download auto captions via yt-dlp: {exc}"

        caption_files = sorted(Path(tmpdir).glob("*.vtt"))
        if not caption_files:
            return None, (
                "No automatic captions in the requested languages "
                f"({', '.join(SEARCH_LANGUAGES)}) were available for this video. "
                "Try a different language variant or another clip."
            )
        text_chunks = []
        for file in caption_files:
            payload = file.read_text(encoding="utf-8", errors="replace")
            cleaned = _vtt_to_text(payload)
            if cleaned:
                text_chunks.append(cleaned)

    readable = " ".join(text_chunks).strip()
    if not readable:
        return None, "Transcript was empty. Try again or choose another video."

    try:
        cache_path.write_text(json.dumps(readable), encoding="utf-8")
    except Exception:
        # Cache failures should not block the happy path.
        pass

    return readable, None


def _vtt_to_text(vtt_payload: str) -> str:
    """Strip timestamps/cue indices from VTT so we can search plain text."""
    cleaned_lines = []
    for raw_line in vtt_payload.splitlines():
        line = raw_line.strip()
        if not line or line.upper().startswith("WEBVTT"):
            continue
        if "-->" in line:  # timestamp cue
            continue
        if line.isdigit():  # cue index
            continue
        cleaned_lines.append(line)
    return " ".join(cleaned_lines)


def analyze_transcript(video_url: str | None = None) -> tuple[str, str]:
    transcript_text, error = _fetch_transcript(video_url or DEFAULT_VIDEO_URL)
    if error:
        return render_status_box(error, "fail"), ""

    normalized = transcript_text.lower()
    found_term = SEARCH_TERM.lower() in normalized

    if found_term:
        headline = (
            f"🚨 We spotted “{SEARCH_TERM}” in this transcript — a hallucinated emergency-state framing."
        )
        tone = "fail"
    else:
        headline = (
            f"✅ “{SEARCH_TERM}” does **not** show up in the transcript. "
            f"The speaker consistently references {CORRECT_TERM}."
        )
        tone = "success"

    result_line = (
        "Result: the ASR output hallucinated an emergency-state treaty reference."
        if found_term
        else "Result: the captions stay with NOOTS – no emergency-state treaty was mentioned."
    )
    body = [
        f"**Search term**: “{SEARCH_TERM}”.",
        f"**{result_line}**",
        "",
        f"- **{SEARCH_TERM}** → “emergency state treaty” – suggests constitutional crisis powers.",
        f"- **{CORRECT_TERM}** → “National Once-Only Technical System treaty” – "
        "a data-sharing infrastructure for German public administrations.",
        "",
        "Mishearing “NOOTS” as “Not” is an *ASR hallucination*. When an LLM then riffs on "
        "that wrong token, it creates a second-layer hallucination that falsely claims an emergency "
        "law was debated. In reality, the Smart Country convention session discussed register modernisation and once-only data exchange.",
    ]
    return render_status_box(headline, tone), "\n".join(body)


def render_problem_cell() -> None:
    with cell("ℹ️ Problem: ASR hallucinations"):
        gr.Markdown(
            f"""### 👩🏻‍🏫 Background
Automatically generated transcripts and subtitles from video and podcast distribution sites may look like a straightforward
source for grounding summaries or chat-with-your-video use cases. With YouTube in particular, however, there is a systemic hallucination risk:
the EU cybersecurity directive "NIS2" may become "these two", the IT concept of "interoperability" may become the unrelated quality of
"endurability"... and the public-administration data-sharing treaty 🇩🇪 "NOOTS-Staatsvertrag" may become emergency state powers
🇩🇪 "Notstaatsvertrag". Particularly with non-English languages or non-native speakers of English, the hallucination risk from
Automatic Speech Recognition (ASR) compounds with the hallucination risk from chatbot Large Language Models - rendering e.g. ChatGPT Atlas
a brittle tool for such tasks.
            """,
        )

        gr.Image(
            value=DIGITALGIPFEL_IMG,
            show_label=True,
            interactive=False,
            elem_id="digitalgipfel-photo",
            label='ASR trip: "asset" turns into "acid"',
        )

        gr.Markdown("""### 💁🏻‍♀️ Demo
                    We're going to download the YouTube subtitles of a panel discussion
                    recorded at the Smart Country Convention 2025 - and check if the ASR hallucinated emergency state powers (❌) or got
                    the German term "NOOTS-Staatsvertrag" right (✅). The goal is to make visible how ASR errors can
                    cause faulty LLM interpretations built on top of them.
                    """)

        url_box = gr.Textbox(
            label="YouTube video URL",
            value=DEFAULT_VIDEO_URL,
            interactive=False,
        )
        check_button = gr.Button("Check transcript for “Notstaatsvertrag”", variant="primary")
        result_panel = gr.HTML(
            value=render_status_box(
                "👉 Click “Check transcript…” to fetch the captions and verify what was actually said.",
                "placeholder",
            )
        )
        result_details = gr.Markdown(visible=True)
        check_button.click(
            fn=analyze_transcript,
            inputs=url_box,
            outputs=[result_panel, result_details],
            queue=False,
        )
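

# Hypothetical standalone usage (a sketch: in the app this cell is composed by
# the host layout, and `cell` is expected to open a Gradio container context):
#
#     with gr.Blocks() as demo:
#         render_problem_cell()
#     demo.launch()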