Spaces:

compendious
/

precis

Runtime error

App Files Files Community

precis / backend /helpers /transcript.py

compendious

Prompt improvement, UI, better engineering, error checking, and more UI

ef94785 18 days ago

Raw

History Blame Contribute Delete

4.66 kB

	"""
	Async YouTube transcript + title fetcher.

	Public API
	----------
	title, text = await transcript(url_or_id)

	* ``title`` is the video title (str) or ``None`` if unavailable.
	* ``text`` is the joined transcript body (str).

	Both the title fetch (YouTube oEmbed) and the transcript fetch
	(youtube_transcript_api, run in a thread) are launched concurrently,
	so total latency is max(title_time, transcript_time) rather than the sum.
	"""

	from __future__ import annotations

	import asyncio
	import re
	from typing import Optional

	import httpx
	from fastapi import HTTPException
	from youtube_transcript_api import YouTubeTranscriptApi
	from youtube_transcript_api._errors import (
	NoTranscriptFound,
	TranscriptsDisabled,
	VideoUnavailable,
	)

	# ---------------------------------------------------------------------------
	# Constants
	# ---------------------------------------------------------------------------

	# Captures the 11-character video ID that follows one of the known URL
	# markers: "v=" (watch URLs), "youtu.be/", "embed/" or "shorts/".
	# The alternation must use a bare "|"; an escaped "\|" matches a literal
	# pipe character and would make the pattern reject every real URL.
	_YT_ID_RE = re.compile(r"(?:v=|youtu\.be/|embed/|shorts/)([A-Za-z0-9_-]{11})")
	# oEmbed endpoint template; ``{video_id}`` is filled in by ``_fetch_title``.
	_OEMBED_URL = "https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={video_id}&format=json"
	# Language codes passed as ``languages=`` to the transcript fetch.
	_LANG_PREFS = ["en", "en-US", "en-GB"]


	# ---------------------------------------------------------------------------
	# Helpers
	# ---------------------------------------------------------------------------

	def _extract_video_id(url: str) -> str:
	    """Return the 11-char video ID from a YouTube URL or bare ID.

	    Surrounding whitespace (e.g. a trailing newline from a pasted value)
	    is ignored before matching.

	    Raises
	    ------
	    HTTPException
	        400 if the input is neither a bare ID nor a URL that
	        ``_YT_ID_RE`` recognises.
	    """
	    candidate = url.strip()
	    # Accept a bare 11-char ID directly
	    if re.fullmatch(r"[A-Za-z0-9_-]{11}", candidate):
	        return candidate
	    match = _YT_ID_RE.search(candidate)
	    if not match:
	        raise HTTPException(
	            status_code=400,
	            detail="Could not extract a video ID from that URL.",
	        )
	    return match.group(1)


	def _fetch_transcript_sync(video_id: str) -> str:
	    """Blocking transcript fetch. Always call via ``asyncio.to_thread``.

	    Fetches the transcript for *video_id* in one of ``_LANG_PREFS`` and
	    returns the snippet texts joined by single spaces.

	    Raises
	    ------
	    HTTPException
	        422 if transcripts are disabled or none is found in the preferred
	        languages, 404 if the video is unavailable, 502 for any other
	        failure. The library exception is attached as ``__cause__``.
	    """
	    ytt = YouTubeTranscriptApi()
	    try:
	        t = ytt.fetch(video_id, languages=_LANG_PREFS)
	    except TranscriptsDisabled as exc:
	        raise HTTPException(
	            status_code=422,
	            detail="This video has transcripts disabled.",
	        ) from exc
	    except NoTranscriptFound as exc:
	        raise HTTPException(
	            status_code=422,
	            detail="No English transcript found for this video.",
	        ) from exc
	    except VideoUnavailable as exc:
	        raise HTTPException(
	            status_code=404,
	            detail="Video is unavailable or does not exist.",
	        ) from exc
	    except Exception as exc:
	        # Catch-all boundary: anything else is reported as a 502.
	        raise HTTPException(
	            status_code=502,
	            detail=f"Transcript fetch failed: {exc}",
	        ) from exc
	    return " ".join(snippet.text for snippet in t)


	async def _fetch_title(video_id: str) -> Optional[str]:
	    """Look up the video title through YouTube's oEmbed endpoint.

	    Best-effort: returns the ``"title"`` field of the JSON response on
	    HTTP 200, and ``None`` on any other status or on any exception.
	    """
	    endpoint = _OEMBED_URL.format(video_id=video_id)
	    title: Optional[str] = None
	    try:
	        async with httpx.AsyncClient(timeout=8.0) as client:
	            response = await client.get(endpoint)
	            if response.status_code == 200:
	                title = response.json().get("title")
	    except Exception:
	        # Any failure (network, decoding, client shutdown) means "no title".
	        title = None
	    return title


	# ---------------------------------------------------------------------------
	# Public API
	# ---------------------------------------------------------------------------

	async def transcript(url: str) -> tuple[Optional[str], str]:
	    """Fetch (title, transcript_text) for a YouTube URL concurrently.

	    Parameters
	    ----------
	    url:
	        A full YouTube URL or a bare 11-character video ID.

	    Returns
	    -------
	    (title, text):
	        ``title`` is the video title or ``None``; ``text`` is the plain-text
	        transcript suitable for passing directly to ``stream_summary``.

	    Raises
	    ------
	    HTTPException
	        Propagated unchanged from ``_extract_video_id`` and
	        ``_fetch_transcript_sync``.
	    """
	    video_id = _extract_video_id(url)

	    # Run the blocking transcript fetch in a thread while the async title
	    # fetch runs on the event loop (parallel).
	    title_task = asyncio.create_task(_fetch_title(video_id))
	    try:
	        text = await asyncio.to_thread(_fetch_transcript_sync, video_id)

	        # Give the title at most 1 extra second after the transcript is done.
	        # oEmbed can be slow; we never want it to be the bottleneck.
	        # shield() stops wait_for from cancelling (and then waiting on) the
	        # task itself; the task is cancelled in the ``finally`` below.
	        try:
	            title = await asyncio.wait_for(asyncio.shield(title_task), timeout=1.0)
	        except Exception:  # includes asyncio.TimeoutError
	            title = None
	    finally:
	        # Never leave the title request running unattended: this also covers
	        # a failed transcript fetch and cancellation of this coroutine.
	        # cancel() is a no-op when the task has already finished.
	        title_task.cancel()

	    # Return the fetched title (or None if unavailable) and the transcript text.
	    # Callers (e.g., backend/app.py) forward the title to `stream_summary`, which includes it in the prompt
	    # via `build_prompt`. If `title` is None, the prompt omits the title block.
	    return title, text