precis / backend /helpers /transcript.py
compendious's picture
Prompt improvement, UI, better engineering, error checking, and more UI
ef94785
Raw
History Blame Contribute Delete
4.66 kB
"""
Async YouTube transcript + title fetcher.
Public API
----------
title, text = await transcript(url_or_id)
* ``title`` is the video title (str) or ``None`` if unavailable.
* ``text`` is the joined transcript body (str).
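Both the title fetch (YouTube oEmbed) and the transcript fetch
(youtube_transcript_api, run in a thread) are launched concurrently.
Once the transcript is ready the title is given at most one extra second,
so total latency is roughly transcript_time (never more than
transcript_time + 1 s) rather than the sum of both fetches.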
"""
from __future__ import annotations
import asyncio
import re
from typing import Optional
import httpx
from fastapi import HTTPException
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import (
NoTranscriptFound,
TranscriptsDisabled,
VideoUnavailable,
)
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Captures the 11-character video ID that follows a recognised URL marker:
# ``v=`` (watch URLs), ``youtu.be/``, ``embed/``, ``shorts/`` and ``live/``
# (live-stream URLs of the form ``youtube.com/live/<id>``).
_YT_ID_RE = re.compile(r"(?:v=|youtu\.be/|embed/|shorts/|live/)([A-Za-z0-9_-]{11})")
# oEmbed endpoint used for the title lookup; ``{video_id}`` is filled in
# with ``str.format``.
_OEMBED_URL = "https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={video_id}&format=json"
# Language codes handed to the transcript fetch as its ``languages`` argument.
_LANG_PREFS = ["en", "en-US", "en-GB"]
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _extract_video_id(url: str) -> str:
    """Return the 11-char video ID from a YouTube URL or bare ID.

    Surrounding whitespace (for example a trailing newline left over from a
    copy-paste) is ignored, so a padded bare ID is accepted rather than
    rejected.

    Parameters
    ----------
    url:
        A YouTube URL or a bare 11-character video ID.

    Returns
    -------
    str
        The 11-character video ID.

    Raises
    ------
    HTTPException
        Status 400 when no video ID can be found in ``url``.
    """
    candidate = url.strip()
    # Accept a bare 11-char ID directly. ``fullmatch`` is strict, which is
    # why the input has to be stripped before this check.
    if re.fullmatch(r"[A-Za-z0-9_-]{11}", candidate):
        return candidate
    match = _YT_ID_RE.search(candidate)
    if match is None:
        raise HTTPException(
            status_code=400,
            detail="Could not extract a video ID from that URL.",
        )
    return match.group(1)
def _fetch_transcript_sync(video_id: str) -> str:
    """Blocking transcript fetch. Always call via ``asyncio.to_thread``.

    Parameters
    ----------
    video_id:
        The video ID passed to ``YouTubeTranscriptApi.fetch``.

    Returns
    -------
    str
        The text of every transcript snippet, joined with single spaces.

    Raises
    ------
    HTTPException
        422 when transcripts are disabled or no transcript is found for the
        preferred languages, 404 when the video is unavailable, and 502 for
        any other failure. The original library error is attached as
        ``__cause__`` so it remains visible in tracebacks and logs.
    """
    api = YouTubeTranscriptApi()
    try:
        fetched = api.fetch(video_id, languages=_LANG_PREFS)
    except TranscriptsDisabled as exc:
        raise HTTPException(
            status_code=422,
            detail="This video has transcripts disabled.",
        ) from exc
    except NoTranscriptFound as exc:
        raise HTTPException(
            status_code=422,
            detail="No English transcript found for this video.",
        ) from exc
    except VideoUnavailable as exc:
        raise HTTPException(
            status_code=404,
            detail="Video is unavailable or does not exist.",
        ) from exc
    except Exception as exc:
        # Catch-all boundary: anything not handled above is reported as an
        # upstream failure instead of surfacing as an unhandled error.
        raise HTTPException(
            status_code=502,
            detail=f"Transcript fetch failed: {exc}",
        ) from exc
    return " ".join(snippet.text for snippet in fetched)
async def _fetch_title(video_id: str) -> Optional[str]:
    """Look up the video title through YouTube's oEmbed endpoint.

    This is a best-effort lookup: a non-200 response, or any exception
    raised while requesting or decoding the response, yields ``None``
    instead of propagating.

    Parameters
    ----------
    video_id:
        The video ID substituted into the oEmbed URL template.

    Returns
    -------
    Optional[str]
        The ``"title"`` entry of the oEmbed JSON payload, or ``None``.
    """
    oembed_url = _OEMBED_URL.format(video_id=video_id)
    try:
        async with httpx.AsyncClient(timeout=8.0) as http:
            response = await http.get(oembed_url)
            if response.status_code != 200:
                return None
            payload = response.json()
            return payload.get("title")
    except Exception:
        # The title is optional, so a failed lookup must never fail the caller.
        return None
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
async def transcript(url: str) -> tuple[Optional[str], str]:
    """Fetch (title, transcript_text) for a YouTube URL concurrently.

    Parameters
    ----------
    url:
        A full YouTube URL or a bare 11-character video ID.

    Returns
    -------
    (title, text):
        ``title`` is the video title or ``None``; ``text`` is the plain-text
        transcript suitable for passing directly to ``stream_summary``.

    Raises
    ------
    HTTPException
        Propagated unchanged from ``_extract_video_id`` (no video ID found)
        or ``_fetch_transcript_sync`` (transcript could not be fetched).
    """
    video_id = _extract_video_id(url)
    # Run the blocking transcript fetch in a thread while the async title
    # fetch runs on the event loop (parallel).
    title_task = asyncio.create_task(_fetch_title(video_id))
    try:
        text = await asyncio.to_thread(_fetch_transcript_sync, video_id)
        # Give the title at most 1 extra second after the transcript is done.
        # oEmbed can be slow; we never want it to be the bottleneck.
        try:
            title = await asyncio.wait_for(asyncio.shield(title_task), timeout=1.0)
        except Exception:
            # Covers the timeout as well as any unexpected title failure;
            # the title is optional, so fall back to None.
            title = None
    finally:
        # Never leave the title request running in the background: this also
        # covers the transcript fetch raising and this coroutine being
        # cancelled. ``cancel()`` is a no-op once the task has finished.
        title_task.cancel()
    # ``title`` may be None; callers are expected to handle a missing title.
    return title, text