""" Async YouTube transcript + title fetcher. Public API ---------- title, text = await transcript(url_or_id) * ``title`` is the video title (str) or ``None`` if unavailable. * ``text`` is the joined transcript body (str). Both the title fetch (YouTube oEmbed) and the transcript fetch (youtube_transcript_api, run in a thread) are launched concurrently, so total latency is max(title_time, transcript_time) rather than the sum. """ from __future__ import annotations import asyncio import re from typing import Optional import httpx from fastapi import HTTPException from youtube_transcript_api import YouTubeTranscriptApi from youtube_transcript_api._errors import ( NoTranscriptFound, TranscriptsDisabled, VideoUnavailable, ) # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- _YT_ID_RE = re.compile(r"(?:v=|youtu\.be/|embed/|shorts/)([A-Za-z0-9_-]{11})") _OEMBED_URL = "https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={video_id}&format=json" _LANG_PREFS = ["en", "en-US", "en-GB"] # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _extract_video_id(url: str) -> str: """Return the 11-char video ID from a YouTube URL or bare ID.""" # Accept a bare 11-char ID directly if re.fullmatch(r"[A-Za-z0-9_-]{11}", url): return url match = _YT_ID_RE.search(url) if not match: raise HTTPException( status_code=400, detail="Could not extract a video ID from that URL.", ) return match.group(1) def _fetch_transcript_sync(video_id: str) -> str: """Blocking transcript fetch. Always call via ``asyncio.to_thread``.""" ytt = YouTubeTranscriptApi() try: t = ytt.fetch(video_id, languages=_LANG_PREFS) except TranscriptsDisabled: raise HTTPException( status_code=422, detail="This video has transcripts disabled.", ) except NoTranscriptFound: raise HTTPException( status_code=422, detail="No English transcript found for this video.", ) except VideoUnavailable: raise HTTPException( status_code=404, detail="Video is unavailable or does not exist.", ) except Exception as exc: raise HTTPException( status_code=502, detail=f"Transcript fetch failed: {exc}", ) return " ".join(snippet.text for snippet in t) async def _fetch_title(video_id: str) -> Optional[str]: """Fetch the video title via YouTube's oEmbed endpoint (non-blocking).""" url = _OEMBED_URL.format(video_id=video_id) try: async with httpx.AsyncClient(timeout=8.0) as client: r = await client.get(url) if r.status_code == 200: return r.json().get("title") except Exception: pass return None # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- async def transcript(url: str) -> tuple[Optional[str], str]: """Fetch (title, transcript_text) for a YouTube URL concurrently. Parameters ---------- url: A full YouTube URL or a bare 11-character video ID. Returns ------- (title, text): ``title`` is the video title or ``None``; ``text`` is the plain-text transcript suitable for passing directly to ``stream_summary``. """ video_id = _extract_video_id(url) # Run the blocking transcript fetch in a thread while the async title # fetch runs on the event loop (parallel). title_task = asyncio.create_task(_fetch_title(video_id)) text = await asyncio.to_thread(_fetch_transcript_sync, video_id) # Give the title at most 1 extra second after the transcript is done. # oEmbed can be slow; we never want it to be the bottleneck. try: title = await asyncio.wait_for(asyncio.shield(title_task), timeout=1.0) except (asyncio.TimeoutError, Exception): title_task.cancel() title = None # Return the fetched title (or None if unavailable) and the transcript text. # Callers (e.g., backend/app.py) forward the title to `stream_summary`, which includes it in the prompt # via `build_prompt`. If `title` is None, the prompt omits the title block, which is appropriate for # plain‑text or other non‑title scenarios. return title, text